EnterpriseArchitect/scripts/detect_redundancy.py

#!/usr/bin/env python3
"""
Redundancy detection script for configuration files.
Detects long inline content blocks in Markdown files.

Usage:
  python3 detect_redundancy.py <file_path> [file_path...]
"""

import sys
import os
import hashlib

def _summarize_block(run):
    """Build the report record for one non-empty run of ``(line_no, text)`` pairs."""
    return {
        'start_line': run[0][0],
        'end_line': run[-1][0],
        'line_count': len(run),
        # Preview: the first three lines joined, trimmed, capped at 100 chars.
        'preview': ''.join(text for _, text in run[:3]).strip()[:100],
    }


def detect_inline_blocks(file_path, threshold=20):
    """Find long runs of consecutive inline content lines in a Markdown file.

    A run is a maximal sequence of adjacent lines that are non-blank, are not
    headings (stripped text starting with ``#``) and lie outside fenced code
    blocks delimited by lines starting with three backticks. Blank lines,
    headings, fence lines and fenced content all end the current run, so a
    reported range never spans a code block.

    Args:
        file_path: Path of the UTF-8 encoded file to scan.
        threshold: Minimum number of lines (inclusive) a run must have to be
            reported.

    Returns:
        A list of dicts, in file order, with the keys ``start_line`` and
        ``end_line`` (1-based, inclusive), ``line_count`` and ``preview``.
        An empty list is returned, after printing an error message, when the
        file is missing or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f"❌ 文件不存在：{file_path}")
        return []
    except (OSError, UnicodeDecodeError) as exc:
        # Directories, permission problems and non-UTF-8 files are reported
        # instead of aborting the whole run with a traceback.
        print(f"❌ 无法读取文件：{file_path} ({exc})")
        return []

    blocks = []
    current_block = []
    in_code_block = False

    for line_no, line in enumerate(lines, 1):
        stripped = line.strip()

        is_fence = stripped.startswith('```')
        if is_fence:
            in_code_block = not in_code_block

        is_content = (
            not is_fence
            and not in_code_block
            and bool(stripped)
            and not stripped.startswith('#')
        )
        if is_content:
            current_block.append((line_no, line))
            continue

        # Boundary line: close the current run. The emptiness check keeps a
        # non-positive threshold from summarizing an empty run.
        if current_block and len(current_block) >= threshold:
            blocks.append(_summarize_block(current_block))
        current_block = []

    # A run that reaches end-of-file has no boundary line to close it.
    if current_block and len(current_block) >= threshold:
        blocks.append(_summarize_block(current_block))

    return blocks


def estimate_token_savings(lines_count, tokens_per_line=10):
    """Roughly estimate the tokens saved by removing ``lines_count`` lines.

    Args:
        lines_count: Number of lines that would be removed.
        tokens_per_line: Assumed average tokens per line; the default of 10
            is a coarse heuristic and can be overridden by the caller.

    Returns:
        ``lines_count * tokens_per_line``.
    """
    return lines_count * tokens_per_line


def format_report(file_path, blocks):
    """Render the detection result for one file as printable text.

    Returns a single "nothing found" line when ``blocks`` is empty. Otherwise
    returns a header naming the file, one numbered suggestion per block
    (line range, estimated token savings, preview) and a closing line with
    the combined estimated savings.
    """
    if not blocks:
        return f"✅ {file_path}: 未发现冗余内容块\n"

    sections = [f"📋 {file_path}"]
    line_total = 0

    for index, entry in enumerate(blocks, start=1):
        sections.extend([
            f"\n  ⚠️  建议 {index}:",
            f"     位置：第 {entry['start_line']}-{entry['end_line']} 行 ({entry['line_count']}行)",
            f"     预估节省：{estimate_token_savings(entry['line_count'])} tokens",
            f"     预览：{entry['preview']}...",
        ])
        line_total += entry['line_count']

    # The closing summary is computed from the accumulated line count.
    sections.append(f"\n  📊 合计可节省：~{estimate_token_savings(line_total)} tokens\n")

    return '\n'.join(sections)


def main():
    """Command-line entry point: scan each given file and print a report.

    Prints usage help and exits with status 1 when no file path is given.
    Otherwise prints a banner, one report per file, and a final summary with
    the number of detected blocks and their combined estimated token savings.
    """
    targets = sys.argv[1:]
    if not targets:
        print("用法：python3 detect_redundancy.py <file_path> [file_path...]")
        print("示例：python3 detect_redundancy.py AGENTS.md SOUL.md")
        sys.exit(1)

    rule = "=" * 60
    print(rule)
    print("配置文件冗余检测报告")
    print(rule)
    print()

    block_total = 0
    savings_total = 0

    for path in targets:
        found = detect_inline_blocks(path)
        print(format_report(path, found))

        # Savings are estimated per block and then added up.
        for entry in found:
            block_total += 1
            savings_total += estimate_token_savings(entry['line_count'])

    print(rule)
    print(f"📊 汇总：发现 {block_total} 个冗余块，预估节省 {savings_total} tokens")
    print(rule)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()