# ============================================================
# Excel history-data loading helpers
# ============================================================
# Canonical column names (compatible with lottery.py).
HISTORY_COLUMNS = ['开奖时间', '期数', '号码', '开机号', '和值特征', '奇偶比', '大小比', '奇偶形态', '跨度', '其他']

# A row containing any of these cell values is treated as the canonical header row.
_HEADER_MARKERS = ('开奖时间', '期数', '号码')


def _is_header_row(row):
    """Return True if any cell of *row* equals one of the canonical header names."""
    # Cells are compared as stripped strings so padded header text still matches.
    cells = {str(cell).strip() for cell in row}
    return any(marker in cells for marker in _HEADER_MARKERS)


def load_history_dataframe(raw_df=None):
    """Load the history workbook and normalise it to the canonical columns.

    Two on-disk layouts are supported:

    * New layout (written by the fixed fetch_data.py): the first row holds the
      canonical column names and data starts on the second row.
    * Old layout: one leading header/description row, optionally followed by
      a second row holding the canonical column names.

    Args:
        raw_df: Optional DataFrame already read with ``header=None``.  When
            omitted, ``CONFIG['history_file']`` is read with ``pandas.read_excel``.

    Returns:
        A DataFrame with every header row removed, a fresh 0-based index, and
        columns named from ``HISTORY_COLUMNS`` (surplus columns are named
        ``col_<position>``).  An empty input yields an empty DataFrame that
        still carries the canonical columns, so callers can index by name.
    """
    import pandas as pd

    if raw_df is None:
        raw_df = pd.read_excel(CONFIG['history_file'], header=None)

    # An empty sheet has no row 0; return an empty, correctly-labelled frame
    # instead of raising IndexError on iloc[0].
    if raw_df.empty:
        return pd.DataFrame(columns=HISTORY_COLUMNS)

    if _is_header_row(raw_df.iloc[0]):
        header_rows = 1
    elif len(raw_df) > 1 and _is_header_row(raw_df.iloc[1]):
        # Old layout: a leading row followed by the canonical header row.
        header_rows = 2
    else:
        # No canonical header found: the first row is still treated as a header.
        header_rows = 1

    data_df = raw_df.iloc[header_rows:].copy()
    total_cols = len(data_df.columns)
    named_cols = min(total_cols, len(HISTORY_COLUMNS))
    data_df.columns = HISTORY_COLUMNS[:named_cols] + [f'col_{i}' for i in range(named_cols, total_cols)]
    return data_df.reset_index(drop=True)
# ============================================================
# Data-fetch console (merged from the former web_executor.py)
# ============================================================
# Process-wide fetch status, shared by the request handlers and the worker thread.
fetch_status = {
    "is_running": False,
    "last_update": None,
    "last_record_count": 0,
    "last_error": None
}
# Guards fetch_status.  threading.Lock is not re-entrant, so save_fetch_status()
# (which acquires it) must never be called while the lock is already held.
fetch_lock = threading.Lock()


def load_fetch_status():
    """Restore the persisted fetch status from ``CONFIG['fetch_status_file']``.

    A missing, unreadable, malformed or non-object status file is ignored and
    the in-memory defaults are kept.  The current ``is_running`` flag is never
    overwritten by the file, so a stale "running" state left on disk cannot
    block new runs.
    """
    try:
        with open(CONFIG['fetch_status_file'], 'r', encoding='utf-8') as f:
            saved = json.load(f)
    except (OSError, json.JSONDecodeError):
        return

    # Anything but a JSON object cannot be a status record.
    if not isinstance(saved, dict):
        return

    with fetch_lock:
        running = fetch_status.get('is_running', False)
        # Merge in place so keys absent from the file keep their defaults.
        fetch_status.update(saved)
        fetch_status['is_running'] = running


def save_fetch_status():
    """Persist the current fetch status to ``CONFIG['fetch_status_file']``.

    The file is written to a temporary sibling and moved into place with
    ``os.replace`` so an interrupted write cannot leave truncated JSON behind.

    Raises:
        OSError: If the status file cannot be written.
    """
    tmp_path = CONFIG['fetch_status_file'] + '.tmp'
    # The lock is held for the whole write so concurrent savers are serialised.
    with fetch_lock:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(fetch_status, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, CONFIG['fetch_status_file'])


@app.route('/fetch')
def fetch_console():
    """Serve the data-fetch console page (web_console.html)."""
    return send_from_directory(BASE_DIR, 'web_console.html')


@app.route('/api/fetch/status')
def api_fetch_status():
    """Return the current fetch status as JSON."""
    # Copy under the lock, build the response outside it.
    with fetch_lock:
        snapshot = dict(fetch_status)
    return jsonify({
        "success": True,
        "isRunning": snapshot.get("is_running", False),
        "lastUpdate": snapshot.get("last_update"),
        "recordCount": snapshot.get("last_record_count", 0),
        "lastError": snapshot.get("last_error")
    })
def _parse_record_count(stdout):
    """Return the record count reported in fetch_data.py's stdout (0 if none).

    Both the "parsed" and the "saved" progress lines are recognised; the last
    one printed wins.
    """
    import re

    count = 0
    pattern = r'共保存[ \t]*(\d+)[ \t]*条记录|成功解析[ \t]*(\d+)[ \t]*条数据'
    for match in re.finditer(pattern, stdout or ''):
        count = int(match.group(1) or match.group(2))
    return count


def _persist_fetch_status():
    """Save the fetch status best-effort; a failed write must not kill the worker."""
    try:
        save_fetch_status()
    except OSError as exc:
        print(f"❌ {exc}")


def _finish_fetch(error=None, record_count=0):
    """Record the outcome of a run, clear the running flag and persist it."""
    with fetch_lock:
        if error is None:
            fetch_status["last_update"] = datetime.now().isoformat()
            fetch_status["last_record_count"] = record_count
        else:
            fetch_status["last_error"] = error
        fetch_status["is_running"] = False
    # Saved outside the lock: save_fetch_status() acquires fetch_lock itself.
    _persist_fetch_status()


@app.route('/api/fetch/execute', methods=['POST'])
def api_fetch_execute():
    """Start the fetch script in a background thread.

    Returns:
        JSON ``{"success": True, ...}`` when a run was started, HTTP 409 when a
        run is already in progress, or HTTP 500 when the worker thread could
        not be started.
    """
    # Check and claim the running flag in ONE critical section; setting it
    # later in the worker would let two concurrent requests both pass the check.
    with fetch_lock:
        if fetch_status.get("is_running", False):
            return jsonify({
                "success": False,
                "error": "任务正在执行中,请稍后再试"
            }), 409
        fetch_status["is_running"] = True
        fetch_status["last_error"] = None

    def run_fetch_script():
        """Run fetch_data.py as a subprocess and record its outcome."""
        import subprocess

        _persist_fetch_status()
        try:
            print(f"[{datetime.now()}] 开始执行抓取脚本...")
            result = subprocess.run(
                [sys.executable, CONFIG['fetch_script']],
                capture_output=True,
                text=True,
                timeout=CONFIG['fetch_timeout']
            )
        except subprocess.TimeoutExpired:
            error_msg = f"脚本执行超时(超过 {CONFIG['fetch_timeout']} 秒)"
        except Exception as e:
            # Thread boundary: any failure must still clear the running flag.
            error_msg = f"执行异常:{str(e)}"
        else:
            if result.returncode == 0:
                record_count = _parse_record_count(result.stdout)
                _finish_fetch(record_count=record_count)
                print(f"✅ 抓取成功,共 {record_count} 条数据")
                return
            error_msg = result.stderr or f"脚本执行失败,返回码:{result.returncode}"

        _finish_fetch(error=error_msg)
        print(f"❌ {error_msg}")

    try:
        threading.Thread(target=run_fetch_script, daemon=True).start()
    except RuntimeError as exc:
        # The thread never ran, so release the flag claimed above.
        error_msg = f"执行异常:{exc}"
        _finish_fetch(error=error_msg)
        return jsonify({"success": False, "error": error_msg}), 500

    return jsonify({
        "success": True,
        "message": "任务已启动,正在执行中..."
    })
def _parse_table_rows(html):
    """Extract the text rows of the first ``<table>`` in *html*.

    Only rows with at least 8 cells (``td`` or ``th``) are kept.

    Returns:
        A list of rows, each a list of stripped cell strings, or ``None`` when
        no table or no qualifying row is found.
    """
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table")
    if not table:
        print("错误:未找到数据表格")
        return None

    data_rows = []
    for row in table.find_all("tr"):
        cols = row.find_all(["td", "th"])
        if len(cols) >= 8:
            data_rows.append([col.get_text(strip=True) for col in cols])

    if not data_rows:
        print("错误:未解析到任何数据")
        return None

    print(f"成功解析 {len(data_rows)} 条数据")
    return data_rows


def fetch_lottery_data():
    """Download and parse the lottery history table, retrying on failure.

    Up to ``MAX_RETRIES`` attempts are made.  The first attempt waits
    ``REQUEST_DELAY`` seconds and each retry waits ``RETRY_DELAY`` seconds
    before requesting ``URL``.

    Returns:
        The parsed rows (list of lists of str), or ``None`` when the page has
        no usable table or every attempt failed.
    """
    print(f"[{datetime.now()}] 开始抓取数据...")

    last_error = None
    for attempt in range(1, MAX_RETRIES + 1):
        # Wait before every request to avoid hammering the site.
        if attempt > 1:
            print(f" 第 {attempt} 次重试,等待 {RETRY_DELAY} 秒...")
            time.sleep(RETRY_DELAY)
        else:
            time.sleep(REQUEST_DELAY)

        try:
            response = requests.get(URL, headers=HEADERS, timeout=30)
            response.raise_for_status()
            response.encoding = "utf-8"
            return _parse_table_rows(response.text)
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"网络请求错误(第 {attempt} 次):{e}")
        except Exception as e:
            last_error = e
            print(f"解析错误(第 {attempt} 次):{e}")

    # Reached only after every attempt raised; report the final failure once.
    print(f"抓取失败,共尝试 {MAX_RETRIES} 次,最后错误:{last_error}")
    return None
def save_to_excel(data_rows):
    """Save the scraped rows to the Excel file at ``OUTPUT_FILE``.

    The output layout is the one lottery.py and app.py expect:

    * columns: 开奖时间 | 期数 | 号码 | 开机号 | 和值特征 | 奇偶比 | 大小比 | 奇偶形态 | 跨度 | 其他
    * the first sheet row holds the column names, data starts on the second row

    Args:
        data_rows: List of rows (lists of cell strings).  A leading header row
            copied from the web table is detected and dropped.

    Returns:
        ``True`` when the file was written, ``False`` when there was nothing
        to save or writing failed.
    """
    if not data_rows:
        print("无数据可保存")
        return False

    try:
        # Drop the web table's own header row if the first row looks like one.
        header_keywords = {'开奖时间', '期数', '号码', '开奖日期'}
        first_row_text = str(data_rows[0])
        if any(kw in first_row_text for kw in header_keywords):
            data_rows = data_rows[1:]
            print(f"跳过 header 行,实际数据 {len(data_rows)} 条")

        # A header-only table leaves nothing to write; say so explicitly
        # instead of letting min() fail on an empty sequence below.
        if not data_rows:
            print("无数据可保存")
            return False

        # Canonical column names (as expected by lottery.py).
        standard_columns = ['开奖时间', '期数', '号码', '开机号', '和值特征', '奇偶比', '大小比', '奇偶形态', '跨度', '其他']

        # Truncate every row to the shortest row so the frame is rectangular.
        num_cols = min(len(row) for row in data_rows)
        data_rows = [row[:num_cols] for row in data_rows]

        # Use the canonical names, truncated or padded with generic names.
        if num_cols <= len(standard_columns):
            actual_columns = standard_columns[:num_cols]
        else:
            actual_columns = standard_columns + [f'列{i+1}' for i in range(num_cols - len(standard_columns))]

        df = pd.DataFrame(data_rows, columns=actual_columns)
        df.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")

        print(f"[{datetime.now()}] 数据已保存到:{OUTPUT_FILE}")
        print(f"共保存 {len(df)} 条记录")
        return True

    except Exception as e:
        print(f"保存 Excel 错误:{e}")
        return False
b/web_console.html @@ -192,7 +192,7 @@
/Users/vincent/Studio/lottoData/双色球历史数据.xlsx/home/vincent/Studio/lottoData/双色球历史数据.xlsx/Users/vincent/Studio/lottoData/fetch_data.py |
+ 脚本路径:/home/vincent/Studio/lottoData/fetch_data.py |
输出文件:双色球历史数据.xlsx