104 lines
3.9 KiB
Python
104 lines
3.9 KiB
Python
|
|
"""Clean up old data and backfill missing AI summaries."""
|
|||
|
|
import asyncio
|
|||
|
|
import sys
|
|||
|
|
sys.path.insert(0, ".")
|
|||
|
|
|
|||
|
|
from app.database import AsyncSessionLocal, init_db
|
|||
|
|
from app.models import Report, AnalysisResult, ExtractedContent
|
|||
|
|
from app.services.ai_analyzer import ai_analyzer
|
|||
|
|
from sqlalchemy import select, delete
|
|||
|
|
from sqlalchemy.orm import selectinload
|
|||
|
|
|
|||
|
|
async def cleanup_and_fix(min_year: int = 2024) -> None:
    """Purge outdated reports, backfill missing AI summaries, then print a count.

    The job runs three steps inside a single database session:

    1. Delete every report whose ``report_year`` is below ``min_year``,
       together with its extracted-content and analysis rows.
    2. For each extracted report from ``min_year`` onwards that has section
       analyses but no ``"summary"`` analysis, ask ``ai_analyzer`` for a
       summary and store it as a new ``AnalysisResult``.
    3. Print how many reports from ``min_year`` onwards remain.

    Args:
        min_year: Oldest report year to keep. Defaults to 2024, which
            reproduces the behaviour of the original hard-coded script.
    """
    print("=" * 60)
    print("数据清理和修复")
    print("=" * 60)

    await init_db()

    async with AsyncSessionLocal() as db:
        # Step 1: drop everything older than the retention boundary.
        print(f"\n1. 清理{min_year}年之前的旧数据...")
        await _purge_reports_before(db, min_year)

        # Step 2: generate the overall summary where only section analyses exist.
        print("\n2. 为缺失AI总结的报告生成汇总...")
        await _backfill_missing_summaries(db, min_year)

        # Step 3: report what is left.
        print("\n3. 当前数据统计:")
        await _print_report_count(db, min_year)


async def _purge_reports_before(db, min_year):
    """Delete reports older than ``min_year`` plus their child rows; return the count."""
    result = await db.execute(select(Report).where(Report.report_year < min_year))
    old_reports = result.scalars().all()

    print(f" 找到 {len(old_reports)} 份{min_year}年前的报告")

    for report in old_reports:
        # Child rows are removed with explicit DELETE statements first, so the
        # script does not rely on cascade settings that are not visible here.
        await db.execute(
            delete(ExtractedContent).where(ExtractedContent.report_id == report.id)
        )
        await db.execute(
            delete(AnalysisResult).where(AnalysisResult.report_id == report.id)
        )
        await db.delete(report)

    await db.commit()
    print(f" ✓ 已删除 {len(old_reports)} 份旧报告")
    return len(old_reports)


async def _backfill_missing_summaries(db, min_year):
    """Create a ``"summary"`` analysis for reports that only have section analyses.

    Returns the number of summaries that were stored.
    """
    stmt = (
        select(Report)
        .where(
            Report.is_extracted == True,  # noqa: E712 - SQL expression, not a Python comparison
            Report.report_year >= min_year,
        )
        .options(
            selectinload(Report.company),
            selectinload(Report.analysis_results),
        )
    )
    result = await db.execute(stmt)
    reports = result.scalars().all()

    # Snapshot everything needed as plain values BEFORE the first commit.
    # The loop below commits after each stored summary; if the session expires
    # instances on commit (SQLAlchemy's default), touching ORM attributes
    # afterwards would need implicit lazy-load IO, which is not available
    # under asyncio.
    pending = []
    for report in reports:
        analyses = report.analysis_results
        if any(a.analysis_type == "summary" for a in analyses):
            continue  # already summarised

        sections = [
            {
                "section_name": a.section_name,
                "analysis": a.summary,
                "success": True,
            }
            for a in analyses
            if a.analysis_type == "section"
        ]
        if not sections:
            continue  # nothing to summarise from

        company = report.company
        company_name = company.short_name or company.company_name
        pending.append((report.id, company_name, report.title, sections))

    fixed_count = 0
    for index, (report_id, company_name, title, sections) in enumerate(pending, start=1):
        # Show the same resolved company name that is sent to the analyzer,
        # and tolerate a missing title when building the preview.
        title_preview = (title or "")[:20]
        print(f"\n [{index}] 修复: {company_name} - {title_preview}...")

        summary = await ai_analyzer.summarize_analyses(sections, company_name, title)

        if not summary.get("success"):
            print(" ✗ 生成失败")
            continue

        db.add(
            AnalysisResult(
                report_id=report_id,
                analysis_type="summary",
                section_name="综合分析",
                ai_model=ai_analyzer.model,
                summary=summary["summary"],
                token_count=summary.get("tokens", 0),
                is_final=True,
            )
        )
        # Commit per report so completed work survives a later failure.
        await db.commit()
        fixed_count += 1
        print(" ✓ 已生成AI总结")

    print(f"\n修复完成: 共 {fixed_count}/{len(pending)} 份报告已补充AI总结")
    return fixed_count


async def _print_report_count(db, min_year):
    """Print and return the number of reports with ``report_year >= min_year``."""
    # Local import: the file's top-level sqlalchemy import only brings in
    # ``select`` and ``delete``.
    from sqlalchemy import func

    # Count in the database instead of loading every row just to call len().
    stmt = (
        select(func.count())
        .select_from(Report)
        .where(Report.report_year >= min_year)
    )
    total = (await db.execute(stmt)).scalar_one()
    print(f" {min_year}年及以后的报告: {total} 份")
    return total
|
|||
|
|
|
|||
|
|
if __name__ == "__main__":
    # Script entry point: build the maintenance coroutine and drive it to
    # completion with asyncio.run.
    maintenance_job = cleanup_and_fix()
    asyncio.run(maintenance_job)
|