# Lottery/history_loader.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
history_loader.py — 双色球历史数据 Excel 公共加载模块

统一 Excel 格式检测和列名标准化逻辑，供 lottery.py 和 app.py 共同引用。
支持三种格式：
  格式A: Row0=新列名(期号|开奖日期|红球1~6|蓝球|特别号), Row1=旧列名(开奖时间|期数|号码|...), Row2+=数据
  格式B: Row0=标准列名(开奖时间|期数|号码|开机号|...), Row1+=数据
  格式C: 直接含"号码"列的标准 DataFrame
"""

import pandas as pd
import os
from collections import Counter
import re

# Canonical column names (the columns lottery.py and app.py expect).
LEGACY_COLUMNS = ['开奖时间', '期数', '号码', '开机号', '和值特征', '奇偶比', '大小比', '奇偶形态', '跨度', '其他']


def _apply_legacy_headers(frame):
    """Rename the columns of *frame* positionally, in place, and return it.

    The first columns receive the names from LEGACY_COLUMNS; any surplus
    column is named ``col_<position>``.
    """
    width = len(frame.columns)
    known = min(width, len(LEGACY_COLUMNS))
    frame.columns = LEGACY_COLUMNS[:known] + \
        [f'col_{i}' for i in range(known, width)]
    return frame


def load_history_dataframe(history_file):
    """Load the history workbook, coping with the supported sheet layouts.

    Layout detection looks at the first two rows of the raw sheet:

    * Layout A -- row 0 holds the new header names and row 1 the legacy
      ones; data starts at row 2.
    * Layout B -- row 0 holds the legacy header names; data starts at row 1.
    * Layout C -- anything else; the sheet is re-read with pandas' default
      header handling, and a '号码' column is assembled from the per-ball
      columns when they are present.

    Returns a DataFrame with a fresh 0..n-1 index.  An empty DataFrame is
    returned when the file does not exist or the sheet has no cells.
    """
    if not os.path.exists(history_file):
        return pd.DataFrame()

    raw_df = pd.read_excel(history_file, header=None)
    if raw_df.empty:
        return pd.DataFrame()

    # Header candidates, compared as plain strings.
    first_row = raw_df.iloc[0].astype(str).tolist()
    second_row = raw_df.iloc[1].astype(str).tolist() if len(raw_df) > 1 else []

    legacy_markers = ['开奖时间', '期数', '号码']
    new_markers = ['期号', '开奖日期', '红球 1']

    legacy_in_first = any(name in first_row for name in legacy_markers)
    legacy_in_second = any(name in second_row for name in legacy_markers)
    new_in_first = any(name in first_row for name in new_markers)

    if new_in_first and legacy_in_second:
        # Layout A: drop both header rows and name the columns positionally.
        body = _apply_legacy_headers(raw_df.iloc[2:].copy())
        return body.reset_index(drop=True)

    if legacy_in_first:
        # Layout B: drop the single header row.
        body = _apply_legacy_headers(raw_df.iloc[1:].copy())
        return body.reset_index(drop=True)

    # Layout C: let pandas interpret the header row itself.
    data_df = pd.read_excel(history_file)
    if '号码' not in data_df.columns:
        # Possibly one column per ball; try both header spellings.
        spaced = [f'红球 {i}' for i in range(1, 7)]
        compact = [f'红球{i}' for i in range(1, 7)]
        red_cols = spaced if all(c in data_df.columns for c in spaced) else compact
        if '蓝球' in data_df.columns and all(c in data_df.columns for c in red_cols):
            data_df['号码'] = data_df.apply(
                lambda row: _build_number_string(row, red_cols), axis=1)

    # Only fall back to positional names when no key legacy column exists.
    if not any(c in data_df.columns for c in LEGACY_COLUMNS[:3]):
        _apply_legacy_headers(data_df)

    return data_df.reset_index(drop=True)


def _build_number_string(row, red_cols):
    """Join the red-ball cells and the blue-ball cell into one digit string.

    Each cell is rendered as at least two characters (zero-padded on the
    left), red balls first in the order given by *red_cols*, then the
    '蓝球' cell.  Returns None as soon as any of those cells is missing.
    """
    def two_digits(cell):
        # Numbers are truncated to int; anything else is used as stripped text.
        if isinstance(cell, (int, float)):
            text = str(int(cell))
        else:
            text = str(cell).strip()
        return text.zfill(2)

    pieces = []
    for column in [*red_cols, '蓝球']:
        cell = row.get(column)
        if pd.isna(cell):
            return None
        pieces.append(two_digits(cell))
    return ''.join(pieces)


def _checked_draw(red_balls, blue_ball):
    """Return ``(red_balls, blue_ball)`` when every ball is in range, else ``([], 0)``.

    Red balls must lie in 1..33 and the blue ball in 1..16.
    """
    if all(1 <= ball <= 33 for ball in red_balls) and 1 <= blue_ball <= 16:
        return red_balls, blue_ball
    return [], 0


def parse_number_string(numbers_str):
    """Parse a draw into ``(red_balls, blue_ball)``.

    Accepted inputs:

    - concatenated digits: ``'08121821243001'`` (6 red x 2 digits followed
      by 1 blue x 2 digits; digits beyond the 14th are ignored);
    - the same value read back from a numeric spreadsheet cell, where the
      leading zero has been dropped (``8121821243001``) and/or a trailing
      ``.0`` has been added (``10121821243001.0``);
    - plus-separated: ``'03,12,16,22,25,28+10'``;
    - space/comma separated: ``'08 12 18 21 24 30 01'``.

    Returns ``([], 0)`` for empty/missing input, for input that matches none
    of the formats, and for draws with a ball outside its valid range.
    """
    if not numbers_str or pd.isna(numbers_str):
        return [], 0

    s = str(numbers_str).strip()

    # Case 1: one unbroken run of digits, optionally followed by ".0" as
    # produced by str() on a float cell.  A 13-digit run can only be a
    # 14-digit draw whose leading zero was lost in a numeric cell, so it is
    # padded back; the range check below still rejects anything implausible.
    compact = re.fullmatch(r'(\d{13,})(?:\.0+)?', s)
    if compact:
        digits = compact.group(1).zfill(14)
        red_balls = [int(digits[i:i + 2]) for i in range(0, 12, 2)]
        return _checked_draw(red_balls, int(digits[12:14]))

    # Case 2: plus-separated; commas and the plus sign are treated as spaces.
    if '+' in s:
        parts = s.replace(',', ' ').replace('+', ' ').split()
        if len(parts) >= 7:
            try:
                return _checked_draw([int(x) for x in parts[:6]], int(parts[6]))
            except ValueError:
                # A non-numeric token means the text is not a draw.
                pass
        return [], 0

    # Case 3: take the first seven digit runs, whatever separates them.
    number_list = re.findall(r'\d+', s)
    if len(number_list) >= 7:
        return _checked_draw([int(x) for x in number_list[:6]], int(number_list[6]))

    return [], 0


def _decode_draw(cell):
    """Decode one '号码' cell into ``(reds, blue)``; ``([], 0)`` when unusable."""
    text = str(cell).strip()
    # A cell that starts with 14 digits is decoded positionally.  Checking
    # for digits first (rather than only the length) avoids slicing through
    # separators, which either raised ValueError or produced wrong balls.
    # Trailing characters are tolerated so that float-typed cells such as
    # '10121821243001.0' keep working.
    if re.match(r'\d{14}', text):
        reds = [int(text[i:i + 2]) for i in range(0, 12, 2)]
        blue = int(text[12:14])
        if all(1 <= r <= 33 for r in reds) and 1 <= blue <= 16:
            return reds, blue
        return [], 0
    # Every other spelling (plus/comma/space separated) goes through the
    # shared parser so both code paths agree on what a valid draw is.
    return parse_number_string(text)


def _range_summary(values):
    """Return min/max/mean/std (population std, numpy's default) of *values*."""
    import numpy as np  # local import: numpy is only needed for these summaries
    arr = np.array(values)
    return {
        'min': int(arr.min()), 'max': int(arr.max()),
        'mean': float(arr.mean()), 'std': float(arr.std())
    }


def compute_statistics(history_file):
    """Compute summary statistics from the history workbook.

    Returns ``{}`` when *history_file* does not exist.  Otherwise returns a
    dict that always contains ``'history_count'`` (number of loaded rows)
    and, when the corresponding data is available:

    - ``'hot_reds'``: up to 15 red balls, most frequent first;
    - ``'cold_reds'``: the last 15 entries of that same frequency ranking
      (only balls that appeared at least once are ranked);
    - ``'hot_blues'``: up to 8 blue balls, most frequent first;
    - ``'common_odd_even'`` / ``'common_size_ratio'``: the most frequent
      value of the '奇偶比' / '大小比' columns;
    - ``'sum_range'`` / ``'span_range'``: min, max, mean and std of the
      red-ball sum and of ``max(red) - min(red)`` per draw.

    Rows whose '号码' cell cannot be decoded into a valid draw are skipped
    for the ball statistics but still count towards ``'history_count'``.
    """
    if not os.path.exists(history_file):
        return {}

    data_df = load_history_dataframe(history_file)

    red_ball_counts = Counter()
    blue_ball_counts = Counter()
    odd_even_ratios = Counter()
    size_ratios = Counter()
    sum_values = []
    span_values = []

    for _, row in data_df.iterrows():
        reds, blue = _decode_draw(row.get('号码', ''))
        if reds:
            red_ball_counts.update(reds)
            blue_ball_counts[blue] += 1
            sum_values.append(sum(reds))
            span_values.append(max(reds) - min(reds))

        # Ratio columns are tallied as text; 'nan' is what str() yields for
        # a missing cell, so it is excluded.
        oe = str(row.get('奇偶比', '')).strip()
        sz = str(row.get('大小比', '')).strip()
        if oe and oe != 'nan':
            odd_even_ratios[oe] += 1
        if sz and sz != 'nan':
            size_ratios[sz] += 1

    stats = {}

    if red_ball_counts:
        sorted_reds = sorted(red_ball_counts.items(), key=lambda x: x[1], reverse=True)
        stats['hot_reds'] = [ball for ball, _ in sorted_reds[:15]]
        stats['cold_reds'] = [ball for ball, _ in sorted_reds[-15:]]

    if blue_ball_counts:
        sorted_blues = sorted(blue_ball_counts.items(), key=lambda x: x[1], reverse=True)
        stats['hot_blues'] = [ball for ball, _ in sorted_blues[:8]]

    if odd_even_ratios:
        stats['common_odd_even'] = max(odd_even_ratios, key=odd_even_ratios.get)
    if size_ratios:
        stats['common_size_ratio'] = max(size_ratios, key=size_ratios.get)

    if sum_values:
        stats['sum_range'] = _range_summary(sum_values)
    if span_values:
        stats['span_range'] = _range_summary(span_values)

    stats['history_count'] = len(data_df)
    return stats