# huibao/backend/process_reports.py

"""
处理已下载的报告：提取内容 + AI分析
"""
import asyncio
import sys
# Put the current working directory first on the import path; this runs
# before the `app.*` imports below so they are looked up there first.
sys.path.insert(0, ".")

from app.database import AsyncSessionLocal, init_db
from app.models import Report
from app.services.pdf_extractor import pdf_extractor
from app.services.ai_analyzer import ai_analyzer
from sqlalchemy import select
from sqlalchemy.orm import selectinload

async def process_reports():
    """Extract content from downloaded reports, then run AI analysis on each.

    Initializes the database, selects every ``Report`` flagged as downloaded
    but not yet extracted (with its ``company`` relationship eagerly loaded),
    and for each one:

    1. calls ``pdf_extractor.extract_and_save(db, report)``;
    2. if that returned any content, calls
       ``ai_analyzer.analyze_report(db, report)`` and reports its truthy /
       falsy result as success / failure.

    Progress is printed to stdout. An exception raised while processing one
    report is printed and the loop continues with the next report.

    Returns:
        None.
    """
    banner = "=" * 60
    print(banner)
    print("处理已下载的报告：提取内容 + AI分析")
    print(banner)

    await init_db()

    async with AsyncSessionLocal() as db:
        # Find reports that are downloaded but not yet extracted.
        # `== True` / `== False` are intentional: on SQLAlchemy columns the
        # comparison operator builds the SQL predicate.
        stmt = select(Report).where(
            Report.is_downloaded == True,  # noqa: E712
            Report.is_extracted == False,  # noqa: E712
        ).options(selectinload(Report.company))  # company is read in the loop

        result = await db.execute(stmt)
        reports = result.scalars().all()
        total = len(reports)

        print(f"\n找到 {total} 份待处理的报告")

        for i, report in enumerate(reports, 1):
            company_name = report.company.short_name if report.company else "未知"
            # This line runs outside the try below, so a missing (None) title
            # must not raise here and abort the remaining reports.
            title = report.title or ""
            print(f"\n[{i}/{total}] 处理: {company_name} - {title[:30]}...")

            try:
                # 1. Extract the PDF content.
                print("   → 提取PDF内容...")
                contents = await pdf_extractor.extract_and_save(db, report)
                print(f"   ✓ 提取了 {len(contents)} 个章节")

                # 2. Run AI analysis only when something was extracted.
                if contents:
                    print("   → 执行AI分析...")
                    success = await ai_analyzer.analyze_report(db, report)
                    if success:
                        print("   ✓ AI分析完成")
                    else:
                        print("   ✗ AI分析失败")
                else:
                    print("   ⚠ 无提取内容，跳过AI分析")

            except Exception as e:
                # Best-effort batch: report the failure and move on.
                # NOTE(review): if the failure originated in a DB flush, the
                # session may need a rollback before the next report — confirm
                # how extract_and_save / analyze_report manage transactions.
                print(f"   ✗ 处理失败: {e}")

        print("\n" + banner)
        print("处理完成！")
        print(banner)

if __name__ == "__main__":
    # Script entry point: build the coroutine and run it to completion on a
    # fresh event loop.
    pipeline = process_reports()
    asyncio.run(pipeline)