huibao/backend/cleanup_old_data.py

104 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Clean up old data (reports from before 2024) and backfill missing AI summaries.
"""
import asyncio
import sys
sys.path.insert(0, ".")
from app.database import AsyncSessionLocal, init_db
from app.models import Report, AnalysisResult, ExtractedContent
from app.services.ai_analyzer import ai_analyzer
from sqlalchemy import select, delete
from sqlalchemy.orm import selectinload
async def _delete_reports_before(db, cutoff_year: int) -> None:
    """Delete every report whose ``report_year`` is below *cutoff_year*.

    The ``ExtractedContent`` and ``AnalysisResult`` rows that point at each
    report are deleted explicitly before the report itself. All deletions are
    committed together in a single transaction.

    Args:
        db: The open async database session to operate on.
        cutoff_year: Reports with ``report_year < cutoff_year`` are removed.
    """
    print(f"\n1. 清理{cutoff_year}年之前的旧数据...")
    result = await db.execute(
        select(Report).where(Report.report_year < cutoff_year)
    )
    old_reports = result.scalars().all()
    deleted_count = len(old_reports)
    print(f" 找到 {deleted_count} 份{cutoff_year}年前的报告")

    for report in old_reports:
        # Remove the dependent rows first, then the report itself.
        await db.execute(
            delete(ExtractedContent).where(ExtractedContent.report_id == report.id)
        )
        await db.execute(
            delete(AnalysisResult).where(AnalysisResult.report_id == report.id)
        )
        await db.delete(report)

    await db.commit()
    print(f" ✓ 已删除 {deleted_count} 份旧报告")


async def _backfill_missing_summaries(db, cutoff_year: int) -> None:
    """Generate the missing "summary" analysis for extracted reports.

    A report qualifies when it is extracted, its ``report_year`` is at least
    *cutoff_year*, it has one or more "section" analysis results and it has no
    "summary" analysis result yet. Each generated summary is committed on its
    own, so earlier successes are kept if a later report fails.

    Args:
        db: The open async database session to operate on.
        cutoff_year: Only reports with ``report_year >= cutoff_year`` are
            considered.
    """
    print("\n2. 为缺失AI总结的报告生成汇总...")
    stmt = (
        select(Report)
        .where(
            # SQL expression, not a Python truth test, hence "== True".
            Report.is_extracted == True,  # noqa: E712
            Report.report_year >= cutoff_year,
        )
        .options(
            selectinload(Report.company),
            selectinload(Report.analysis_results),
        )
    )
    result = await db.execute(stmt)

    # Snapshot everything we need as plain values BEFORE the first commit.
    # If the session expires instances on commit (SQLAlchemy's default; the
    # session factory's configuration is not visible here), touching ORM
    # attributes afterwards would trigger implicit IO, which AsyncSession
    # does not permit.
    pending = []
    for report in result.scalars().all():
        analyses = report.analysis_results
        if any(a.analysis_type == "summary" for a in analyses):
            continue
        section_payload = [
            {
                "section_name": a.section_name,
                "analysis": a.summary,
                "success": True,
            }
            for a in analyses
            if a.analysis_type == "section"
        ]
        if not section_payload:
            continue
        company = report.company
        # Resolve the display name once so the progress line and the AI call
        # agree, and a company without a short name is not printed as "None".
        company_name = company.short_name or company.company_name
        pending.append((report.id, report.title, company_name, section_payload))

    fixed_count = 0
    for index, (report_id, title, company_name, section_payload) in enumerate(
        pending, start=1
    ):
        # A missing title must not crash the progress output.
        title_preview = (title or "")[:20]
        print(f"\n [{index}] 修复: {company_name} - {title_preview}...")

        try:
            summary = await ai_analyzer.summarize_analyses(
                section_payload, company_name, title
            )
        except Exception as exc:
            # One failing AI call should not abort the rest of the batch;
            # report it like any other failed generation and move on.
            print(f" ✗ 生成失败: {exc}")
            continue

        if not summary.get("success"):
            print(" ✗ 生成失败")
            continue

        db.add(
            AnalysisResult(
                report_id=report_id,
                analysis_type="summary",
                section_name="综合分析",
                ai_model=ai_analyzer.model,
                summary=summary["summary"],
                token_count=summary.get("tokens", 0),
                is_final=True,
            )
        )
        await db.commit()
        fixed_count += 1
        print(" ✓ 已生成AI总结")

    print(f"\n修复完成: 共 {fixed_count}/{len(pending)} 份报告已补充AI总结")


async def _print_report_count(db, cutoff_year: int) -> None:
    """Print how many reports have ``report_year >= cutoff_year``.

    Args:
        db: The open async database session to operate on.
        cutoff_year: Lower bound (inclusive) of the years that are counted.
    """
    # Local import: only this helper needs ``func``.
    from sqlalchemy import func

    print("\n3. 当前数据统计:")
    # Let the database count instead of loading every row just to len() it.
    stmt = (
        select(func.count())
        .select_from(Report)
        .where(Report.report_year >= cutoff_year)
    )
    report_count = (await db.execute(stmt)).scalar_one()
    print(f" {cutoff_year}年及以后的报告: {report_count}")


async def cleanup_and_fix(cutoff_year: int = 2024) -> None:
    """Clean up old reports and backfill missing AI summaries.

    Runs three steps inside one database session:

    1. Delete all reports older than *cutoff_year* and their dependent rows.
    2. Generate a "summary" analysis for every extracted report from
       *cutoff_year* onward that has section analyses but no summary.
    3. Print the number of reports from *cutoff_year* onward.

    Args:
        cutoff_year: First year to keep. Defaults to 2024, which reproduces
            the behavior of calling this function without arguments.
    """
    print("=" * 60)
    print("数据清理和修复")
    print("=" * 60)

    await init_db()

    async with AsyncSessionLocal() as db:
        await _delete_reports_before(db, cutoff_year)
        await _backfill_missing_summaries(db, cutoff_year)
        await _print_report_count(db, cutoff_year)
# Script entry point: only runs when the file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    # asyncio.run drives the coroutine to completion on a fresh event loop.
    asyncio.run(cleanup_and_fix())