#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Scraper for Shuangseqiu (double-color-ball) lottery history data.

Downloads the history table from
https://www.55128.cn/kjh/fcssq-history-120.htm and rewrites the
"双色球历史数据.xlsx" workbook located next to this script.
"""

import os
import re
import sys
import time
from datetime import datetime
from typing import List, Optional

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data source URL.
URL = "https://www.55128.cn/kjh/fcssq-history-120.htm"

# Output workbook path (same directory as this script).
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
OUTPUT_FILE = os.path.join(SCRIPT_DIR, "双色球历史数据.xlsx")

# Request headers that imitate a desktop browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
}

# Pause (seconds) before the first request, to avoid getting the IP banned.
REQUEST_DELAY = 2

# Maximum number of fetch attempts.
MAX_RETRIES = 3

# Pause (seconds) before each retry.
RETRY_DELAY = 5

# Table rows with fewer cells than this are ignored by the parser.
MIN_ROW_CELLS = 8

# Words whose presence in the first parsed row marks it as the page's header.
HEADER_KEYWORDS = {'开奖时间', '期数', '号码', '开奖日期'}

# Column names written to the workbook (the layout lottery.py expects).
STANDARD_COLUMNS = ['开奖时间', '期数', '号码', '开机号', '和值特征',
                    '奇偶比', '大小比', '奇偶形态', '跨度', '其他']


def _parse_table_rows(html: str) -> Optional[List[List[str]]]:
    """Extract the text of every sufficiently wide row of the first <table>.

    Returns a list of rows (each a list of stripped cell strings), or None
    when the page has no table or no row has at least MIN_ROW_CELLS cells.
    """
    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table")
    if table is None:
        print("错误:未找到数据表格")
        return None

    data_rows = []
    for row in table.find_all("tr"):
        cols = row.find_all(["td", "th"])
        if len(cols) < MIN_ROW_CELLS:
            continue
        try:
            data_rows.append([col.get_text(strip=True) for col in cols])
        except Exception as e:
            # Best effort: one bad row must not abort the whole parse, but
            # the skip is reported instead of being swallowed silently.
            print(f"跳过无法解析的行:{e}")

    if not data_rows:
        print("错误:未解析到任何数据")
        return None

    print(f"成功解析 {len(data_rows)} 条数据")
    return data_rows


def fetch_lottery_data() -> Optional[List[List[str]]]:
    """Download the history page and return its table rows.

    Makes up to MAX_RETRIES attempts. A network error or an unexpected
    exception while parsing triggers a retry; a page that downloads fine but
    contains no usable table is not retried.

    Returns:
        The parsed rows (header row included, if the page has one), or None
        on failure.
    """
    print(f"[{datetime.now()}] 开始抓取数据...")

    last_error = None

    for attempt in range(1, MAX_RETRIES + 1):
        # Wait before every request so the site is never hit back-to-back.
        if attempt > 1:
            print(f" 第 {attempt} 次重试,等待 {RETRY_DELAY} 秒...")
            time.sleep(RETRY_DELAY)
        else:
            time.sleep(REQUEST_DELAY)

        try:
            response = requests.get(URL, headers=HEADERS, timeout=30)
            response.raise_for_status()
            response.encoding = "utf-8"
            return _parse_table_rows(response.text)
        except requests.exceptions.RequestException as e:
            last_error = e
            print(f"网络请求错误(第 {attempt} 次):{e}")
        except Exception as e:
            last_error = e
            print(f"解析错误(第 {attempt} 次):{e}")

    # Reached only when every attempt raised; summarise the last failure.
    print(f"抓取失败,共尝试 {MAX_RETRIES} 次,最后错误:{last_error}")
    return None


def save_to_excel(data_rows) -> bool:
    """Write the scraped rows to OUTPUT_FILE as an Excel workbook.

    The output format is meant to be compatible with lottery.py and app.py:
    - Columns: 开奖时间 | 期数 | 号码 | 开机号 | 和值特征 | 奇偶比 | 大小比 |
      奇偶形态 | 跨度 | 其他
    - The 号码 column is the 6 red balls + 1 blue ball concatenated into one
      string (e.g. '09101316192108').
    - The first worksheet row holds the column names; data starts on row two.

    Args:
        data_rows: Rows as returned by fetch_lottery_data(); the first row
            may be the page's own header row.

    Returns:
        True when the workbook was written, False when there was nothing to
        save or writing failed.
    """
    if not data_rows:
        print("无数据可保存")
        return False

    try:
        # Drop the web table's own header row (the first row is usually the
        # Chinese column titles) so it is not stored as a data record.
        first_row = data_rows[0]
        if any(kw in str(first_row) for kw in HEADER_KEYWORDS):
            data_rows = data_rows[1:]
            print(f"跳过 header 行,实际数据 {len(data_rows)} 条")

        # A table consisting of the header alone leaves nothing to write;
        # say so rather than failing inside min() below.
        if not data_rows:
            print("无数据可保存")
            return False

        # Make the rows rectangular by truncating to the shortest row.
        num_cols = min(len(row) for row in data_rows)
        data_rows = [row[:num_cols] for row in data_rows]

        # Use the standard names, truncated or extended to fit the data.
        if num_cols <= len(STANDARD_COLUMNS):
            actual_columns = STANDARD_COLUMNS[:num_cols]
        else:
            extra = num_cols - len(STANDARD_COLUMNS)
            actual_columns = STANDARD_COLUMNS + [f'列{i+1}' for i in range(extra)]

        df = pd.DataFrame(data_rows, columns=actual_columns)
        df.to_excel(OUTPUT_FILE, index=False, engine="openpyxl")

        print(f"[{datetime.now()}] 数据已保存到:{OUTPUT_FILE}")
        print(f"共保存 {len(df)} 条记录")
        return True

    except Exception as e:
        print(f"保存 Excel 错误:{e}")
        return False


def main() -> int:
    """Run the fetch-then-save pipeline and return a process exit code.

    Returns:
        0 when the data was fetched and saved, 1 otherwise.
    """
    print("=" * 60)
    print("双色球历史数据抓取工具")
    print("=" * 60)

    data = fetch_lottery_data()
    if not data:
        print("抓取失败")
        return 1

    if not save_to_excel(data):
        print("保存失败")
        return 1

    print("=" * 60)
    print("任务完成!")
    print("=" * 60)
    return 0


if __name__ == "__main__":
    # sys.exit rather than the site-provided exit(), which is not available
    # when the interpreter runs without the site module.
    sys.exit(main())