huibao/backend/cleanup_old_data.py

104 lines
3.9 KiB
Python
Raw Permalink Normal View History

"""
清理旧数据 + 补充缺失的AI总结
"""
import asyncio
import sys
sys.path.insert(0, ".")
from app.database import AsyncSessionLocal, init_db
from app.models import Report, AnalysisResult, ExtractedContent
from app.services.ai_analyzer import ai_analyzer
from sqlalchemy import select, delete
from sqlalchemy.orm import selectinload
async def _delete_reports_before(db, min_year):
    """Delete all reports older than *min_year*, including their child rows.

    Prints how many reports were found and deleted. Everything is committed
    in a single transaction once all deletes have been issued.
    """
    stmt = select(Report).where(Report.report_year < min_year)
    old_reports = (await db.execute(stmt)).scalars().all()
    print(f" 找到 {len(old_reports)} 份{min_year}年前的报告")

    for report in old_reports:
        # Dependent rows are removed explicitly before the report itself.
        await db.execute(
            delete(ExtractedContent).where(ExtractedContent.report_id == report.id)
        )
        await db.execute(
            delete(AnalysisResult).where(AnalysisResult.report_id == report.id)
        )
        await db.delete(report)

    await db.commit()
    print(f" ✓ 已删除 {len(old_reports)} 份旧报告")


async def _backfill_missing_summaries(db, min_year):
    """Generate the missing "summary" analysis for extracted reports.

    A report qualifies when it has at least one "section" analysis but no
    "summary" analysis. Each successful summary is committed immediately so
    that a later failure does not discard earlier work.
    """
    stmt = (
        select(Report)
        .where(
            # SQL expression, not a Python comparison, hence `== True`.
            Report.is_extracted == True,  # noqa: E712
            Report.report_year >= min_year,
        )
        .options(
            selectinload(Report.company),
            selectinload(Report.analysis_results),
        )
    )
    reports = (await db.execute(stmt)).scalars().all()

    # Snapshot everything we need as plain values BEFORE the first commit.
    # A commit expires loaded ORM instances by default (expire_on_commit=True),
    # and re-reading an expired attribute on an AsyncSession would require
    # implicit I/O, which is not allowed. Whether AsyncSessionLocal overrides
    # that default is not visible from this script, so do not rely on it.
    pending = []
    for report in reports:
        analyses = report.analysis_results
        if any(a.analysis_type == "summary" for a in analyses):
            continue
        sections = [
            {
                "section_name": a.section_name,
                "analysis": a.summary,
                "success": True,
            }
            for a in analyses
            if a.analysis_type == "section"
        ]
        if not sections:
            continue
        company = report.company
        company_name = company.short_name or company.company_name
        pending.append((report.id, company_name, report.title, sections))

    fixed_count = 0
    for index, (report_id, company_name, title, sections) in enumerate(pending, start=1):
        # Use the same fallback name as the AI call, and tolerate a missing title.
        title_preview = (title or "")[:20]
        print(f"\n [{index}] 修复: {company_name} - {title_preview}...")

        summary = await ai_analyzer.summarize_analyses(sections, company_name, title)
        if summary.get("success"):
            db.add(
                AnalysisResult(
                    report_id=report_id,
                    analysis_type="summary",
                    section_name="综合分析",
                    ai_model=ai_analyzer.model,
                    summary=summary["summary"],
                    token_count=summary.get("tokens", 0),
                    is_final=True,
                )
            )
            await db.commit()
            fixed_count += 1
            print(" ✓ 已生成AI总结")
        else:
            print(" ✗ 生成失败")

    print(f"\n修复完成: 共 {fixed_count}/{len(pending)} 份报告已补充AI总结")


async def _print_report_count(db, min_year):
    """Print how many reports from *min_year* onwards remain."""
    # Local import: `func` is not among the names imported at the top of the file.
    from sqlalchemy import func

    # Count in SQL instead of loading every row just to call len() on it.
    stmt = (
        select(func.count())
        .select_from(Report)
        .where(Report.report_year >= min_year)
    )
    total = (await db.execute(stmt)).scalar_one()
    print(f" {min_year}年及以后的报告: {total}")


async def cleanup_and_fix(min_year: int = 2024):
    """Clean up old reports and backfill missing AI summaries.

    Steps, all performed on one database session:

    1. Delete reports whose ``report_year`` is below *min_year*, together
       with their extracted content and analysis results.
    2. For extracted reports from *min_year* onwards that have section
       analyses but no summary, ask ``ai_analyzer`` for a summary and store it.
    3. Print the number of reports from *min_year* onwards.

    Args:
        min_year: First report year to keep. Defaults to 2024, the cutoff
            that was previously hard-coded.
    """
    banner = "=" * 60
    print(banner)
    print("数据清理和修复")
    print(banner)

    await init_db()

    async with AsyncSessionLocal() as db:
        # 1. Remove reports older than the cutoff year.
        print(f"\n1. 清理{min_year}年之前的旧数据...")
        await _delete_reports_before(db, min_year)

        # 2. Generate summaries for reports that lack one.
        print("\n2. 为缺失AI总结的报告生成汇总...")
        await _backfill_missing_summaries(db, min_year)

        # 3. Report the current totals.
        print("\n3. 当前数据统计:")
        await _print_report_count(db, min_year)
if __name__ == "__main__":
    # Script entry point: run the maintenance coroutine to completion on a
    # fresh event loop. Nothing runs when the module is merely imported.
    asyncio.run(cleanup_and_fix())