fix: 修复代码审查反馈全部问题

审查意见修复清单:

P1 列映射语义修复 (lottery.py):
- _normalize_history_format() 不再将红球2-6映射到开机号/和值特征/奇偶比等
- 格式A不含这些特征字段，缺失列留空，前端做降级显示
- 删除已用于构建号码列的原始分列，避免数据重复

P2 架构优化:
- 提取 Excel 兼容逻辑到公共模块 history_loader.py
- lottery.py 和 app.py 共同引用，消除三处重复代码
- web_executor.py 标记为已废弃，功能已整合到 app.py

部署修复:
- 删除 deploy/lotto-web.service (旧服务)，仅保留 lotto-app.service
- 更新 deploy/DEPLOY.md: 端口5000→8085, 接口清单更新, 添加迁移说明

安全加固:
- API Token 改为环境变量读取: os.environ.get('LOTTO_API_TOKEN')
- 错误信息不再暴露内部异常，改为通用错误消息+日志记录
- 目录遍历防护改用 os.path.realpath 检查最终路径

其他:
- .gitignore 补充排除双色球历史数据.xlsx
- app.py 引用公共模块，简化 get_statistics_data 和 load_history_dataframe

测试验证: 全部 API 测试通过，120条历史数据正确解析

Issue: BIZ-75
2026-07-04 01:28:57 +08:00
parent 5d5e77000e
commit 5cebbfa433
7 changed files with 370 additions and 270 deletions
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+history_loader.py — 双色球历史数据 Excel 公共加载模块
+
+统一 Excel 格式检测和列名标准化逻辑，供 lottery.py 和 app.py 共同引用。
+支持三种格式：
+  格式A: Row0=新列名(期号|开奖日期|红球1~6|蓝球|特别号), Row1=旧列名(开奖时间|期数|号码|...), Row2+=数据
+  格式B: Row0=标准列名(开奖时间|期数|号码|开机号|...), Row1+=数据
+  格式C: 直接含"号码"列的标准 DataFrame
+"""
+
+import pandas as pd
+import os
+from collections import Counter
+import re
+
+# Canonical column names (the columns lottery.py and app.py expect).
+# load_history_dataframe() assigns these to sheet columns by position.
+LEGACY_COLUMNS = ['开奖时间', '期数', '号码', '开机号', '和值特征', '奇偶比', '大小比', '奇偶形态', '跨度', '其他']
+
+
+def load_history_dataframe(history_file):
+    """Load the history workbook and normalise it to the legacy column layout.
+
+    Three sheet layouts are recognised:
+      A: row 0 holds the new split-column header, row 1 the legacy header,
+         data starts at row 2.
+      B: row 0 holds the legacy header, data starts at row 1.
+      C: anything else; the sheet is re-read with pandas' default header
+         handling and a '号码' column is synthesised from split ball columns
+         when possible.
+
+    Returns an empty DataFrame when the file does not exist or the sheet is
+    empty; otherwise a DataFrame with a fresh 0-based index.
+    """
+
+    def _relabel(frame):
+        # Name columns by position: legacy names first, 'col_<i>' for extras.
+        width = len(frame.columns)
+        known = min(width, len(LEGACY_COLUMNS))
+        frame.columns = LEGACY_COLUMNS[:known] + \
+            [f'col_{i}' for i in range(known, width)]
+        return frame
+
+    if not os.path.exists(history_file):
+        return pd.DataFrame()
+
+    sheet = pd.read_excel(history_file, header=None)
+    if sheet.empty:
+        return pd.DataFrame()
+
+    def _row_as_text(position):
+        # Stringified cells of one raw row, or [] when the row is absent.
+        if len(sheet) > position:
+            return sheet.iloc[position].astype(str).tolist()
+        return []
+
+    legacy_markers = ['开奖时间', '期数', '号码']
+    split_markers = ['期号', '开奖日期', '红球 1']
+    first_row = _row_as_text(0)
+    second_row = _row_as_text(1)
+
+    legacy_in_first = any(name in first_row for name in legacy_markers)
+    legacy_in_second = any(name in second_row for name in legacy_markers)
+    split_in_first = any(name in first_row for name in split_markers)
+
+    if split_in_first and legacy_in_second:
+        # Layout A: drop both header rows, keep the legacy naming.
+        result = _relabel(sheet.iloc[2:].copy())
+    elif legacy_in_first:
+        # Layout B: row 0 already is the legacy header.
+        result = _relabel(sheet.iloc[1:].copy())
+    else:
+        # Layout C: let pandas pick the header row itself.
+        result = pd.read_excel(history_file)
+        if '号码' not in result.columns:
+            # Possibly a split-column sheet; try both spellings of the
+            # red-ball headers before giving up on building '号码'.
+            spaced = [f'红球 {i}' for i in range(1, 7)]
+            compact = [f'红球{i}' for i in range(1, 7)]
+            if all(c in result.columns for c in spaced):
+                red_cols = spaced
+            else:
+                red_cols = compact
+            if '蓝球' in result.columns and all(c in result.columns for c in red_cols):
+                result['号码'] = result.apply(
+                    lambda row: _build_number_string(row, red_cols), axis=1)
+
+        # Only fall back to positional naming when none of the three key
+        # legacy columns is present.
+        if not any(c in result.columns for c in LEGACY_COLUMNS[:3]):
+            _relabel(result)
+
+    return result.reset_index(drop=True)
+
+
+def _build_number_string(row, red_cols):
+    """Join the red-ball cells and the '蓝球' cell into one digit string.
+
+    Each cell is rendered as text and left-padded with zeros to two
+    characters: int/float cells are truncated with int(), anything else is
+    str()-ed and stripped. Returns None as soon as a missing (NaN/None) cell
+    is encountered.
+    """
+    pieces = []
+    # Red columns first, blue ball last, matching the order in the string.
+    for column in (*red_cols, '蓝球'):
+        cell = row.get(column)
+        if pd.isna(cell):
+            return None
+        if isinstance(cell, (int, float)):
+            text = str(int(cell))
+        else:
+            text = str(cell).strip()
+        pieces.append(text.zfill(2))
+    return ''.join(pieces)
+
+
+def parse_number_string(numbers_str):
+    """Parse a draw string into (red_balls, blue_ball).
+
+    Accepted shapes:
+    - concatenated digits: '08121821243001' (six 2-digit reds + 2-digit blue;
+      digits beyond the 14th are ignored)
+    - plus-separated: '03,12,16,22,25,28+10'
+    - any other separators: '08 12 18 21 24 30 01'
+
+    Reds must lie in 1..33 and blue in 1..16. Returns ([], 0) for empty/NaN
+    input or whenever parsing or validation fails.
+    """
+
+    def _validated(reds, blue):
+        # Accept the draw only when every ball is inside its legal range.
+        if all(1 <= ball <= 33 for ball in reds) and 1 <= blue <= 16:
+            return reds, blue
+        return [], 0
+
+    def _from_tokens(tokens):
+        # The first seven tokens are six reds followed by the blue ball.
+        if len(tokens) < 7:
+            return [], 0
+        try:
+            reds = [int(tok) for tok in tokens[:6]]
+            blue = int(tokens[6])
+        except ValueError:
+            return [], 0
+        return _validated(reds, blue)
+
+    if not numbers_str or pd.isna(numbers_str):
+        return [], 0
+
+    text = str(numbers_str).strip()
+
+    # Shape 1: an unbroken run of at least 14 digits, read as 2-digit pairs.
+    if re.match(r'^\d{14,}$', text):
+        pairs = [int(text[pos:pos + 2]) for pos in range(0, 14, 2)]
+        return _validated(pairs[:6], pairs[6])
+
+    # Shape 2: a '+' is present; commas and '+' act as whitespace.
+    if '+' in text:
+        return _from_tokens(text.replace(',', ' ').replace('+', ' ').split())
+
+    # Shape 3: pull out every digit run, whatever the separators are.
+    return _from_tokens(re.findall(r'\d+', text))
+
+
+def compute_statistics(history_file):
+    """Compute summary statistics from the history workbook.
+
+    Args:
+        history_file: Path to the Excel file; it is loaded through
+            load_history_dataframe().
+
+    Returns:
+        {} when the file does not exist. Otherwise a dict that always holds
+        'history_count' (number of loaded rows) and, when data allows:
+        'hot_reds' / 'cold_reds' (up to 15 red balls from the top / bottom of
+        the frequency ranking), 'hot_blues' (up to 8), 'common_odd_even',
+        'common_size_ratio', and 'sum_range' / 'span_range' dicts with
+        min, max, mean and std.
+    """
+    if not os.path.exists(history_file):
+        return {}
+
+    def _extract_draw(cell):
+        # Return (reds, blue) for one '号码' cell, or None when unusable.
+        text = str(cell).strip()
+        if re.match(r'\d{14}', text):
+            # Fixed-width layout: six 2-digit reds followed by a 2-digit blue.
+            reds = [int(text[i:i + 2]) for i in range(0, 12, 2)]
+            blue = int(text[12:14])
+            if all(1 <= r <= 33 for r in reds) and 1 <= blue <= 16:
+                return reds, blue
+            return None
+        # Anything else (e.g. '03,12,16,22,25,28+10') must not be sliced as
+        # fixed-width digits: that raises ValueError or yields wrong balls.
+        # Delegate to the shared parser, which validates ranges itself.
+        reds, blue = parse_number_string(text)
+        if reds:
+            return reds, blue
+        return None
+
+    def _summarize(values):
+        # Local import: numpy is only needed once there is data to summarise.
+        import numpy as np
+        arr = np.array(values)
+        return {
+            'min': int(arr.min()), 'max': int(arr.max()),
+            'mean': float(arr.mean()), 'std': float(arr.std())
+        }
+
+    data_df = load_history_dataframe(history_file)
+
+    red_ball_counts = Counter()
+    blue_ball_counts = Counter()
+    odd_even_ratios = Counter()
+    size_ratios = Counter()
+    sum_values = []
+    span_values = []
+
+    # One pass over the rows collects ball frequencies and ratio labels.
+    for _, row in data_df.iterrows():
+        draw = _extract_draw(row.get('号码', ''))
+        if draw is not None:
+            reds, blue = draw
+            red_ball_counts.update(reds)
+            blue_ball_counts[blue] += 1
+            sum_values.append(sum(reds))
+            span_values.append(max(reds) - min(reds))
+
+        for column, tally in (('奇偶比', odd_even_ratios), ('大小比', size_ratios)):
+            label = str(row.get(column, '')).strip()
+            # Missing cells stringify to 'nan'; skip them.
+            if label and label != 'nan':
+                tally[label] += 1
+
+    stats = {}
+
+    if red_ball_counts:
+        # Only balls seen at least once are ranked, so a ball that never
+        # appears in the history is absent from both lists.
+        sorted_reds = sorted(red_ball_counts.items(), key=lambda x: x[1], reverse=True)
+        stats['hot_reds'] = [ball for ball, _ in sorted_reds[:15]]
+        stats['cold_reds'] = [ball for ball, _ in sorted_reds[-15:]]
+
+    if blue_ball_counts:
+        sorted_blues = sorted(blue_ball_counts.items(), key=lambda x: x[1], reverse=True)
+        stats['hot_blues'] = [ball for ball, _ in sorted_blues[:8]]
+
+    if odd_even_ratios:
+        stats['common_odd_even'] = max(odd_even_ratios, key=odd_even_ratios.get)
+    if size_ratios:
+        stats['common_size_ratio'] = max(size_ratios, key=size_ratios.get)
+
+    if sum_values:
+        stats['sum_range'] = _summarize(sum_values)
+    if span_values:
+        stats['span_range'] = _summarize(span_values)
+
+    stats['history_count'] = len(data_df)
+    return stats