"""
PDF内容提取服务 - 增强版
提取更多内容,更深度的分析
"""
import re
from typing import List, Dict, Optional, Tuple

import fitz
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.utils.logger import logger
from app.models import Report, ExtractedContent


class PDFExtractor:
    """PDF content extractor (enhanced edition).

    Scans a PDF's full text for configured keywords, pulls out the
    surrounding section of text around each hit, deduplicates the
    extracted sections, and persists them to the database.
    """

    # Matches section headings such as "(一)", "1)" or "三、..." at line start.
    # Compiled once at class-creation time instead of on every loop iteration.
    _HEADING_RE = re.compile(r'^[((]?[一二三四五六七八九十\d]+[))]')

    def __init__(self):
        # Keywords that mark interesting sections (from app settings).
        self.keywords = settings.EXTRACT_KEYWORDS
        # Context window around a keyword hit (enlarged for richer context).
        self.context_lines_before = 5
        self.context_lines_after = 80  # up to 80 lines to capture a full section
        self.min_content_length = 200  # discard extractions shorter than this

    def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, int]:
        """Extract all text from the PDF at *pdf_path*.

        Returns (full_text, page_count); returns ("", 0) on any
        extraction failure instead of raising.
        """
        try:
            # Context manager guarantees the document is closed even if a
            # page fails to extract (the original leaked the handle on error).
            with fitz.open(pdf_path) as doc:
                page_count = doc.page_count
                # join() avoids quadratic string concatenation on large PDFs.
                text = "".join(page.get_text() for page in doc)
            logger.info(f"PDF提取完成: {page_count} 页, {len(text)} 字符")
            return text, page_count
        except Exception as e:
            logger.error(f"PDF提取失败: {e}")
            return "", 0

    def extract_sections_by_keywords(self, text: str) -> List[Dict]:
        """Extract keyword-anchored sections from *text* (enhanced version).

        For each configured keyword, every matching line yields a candidate
        section spanning a context window around the hit; the window is
        trimmed early when the next section heading is encountered.
        Hits falling inside an already-extracted range are skipped, and
        results are deduplicated and sorted by content richness.
        """
        sections: List[Dict] = []
        lines = text.split('\n')
        total_lines = len(lines)

        # Line ranges already extracted; used to skip overlapping hits.
        matched_ranges: List[Tuple[int, int]] = []

        for keyword in self.keywords:
            for i, line in enumerate(lines):
                if keyword not in line:
                    continue
                # Skip hits that fall inside an already-extracted range.
                if any(start_r <= i <= end_r for start_r, end_r in matched_ranges):
                    continue

                # Widened extraction window around the hit.
                start = max(0, i - self.context_lines_before)
                end = min(total_lines, i + self.context_lines_after)

                # Stop early at the next section heading (e.g. "第X节", "(X)").
                # Scanning starts 10 lines past the hit so the hit's own
                # heading does not terminate the window immediately.
                for j in range(i + 10, end):
                    next_line = lines[j].strip()
                    # Length check filters real headings from body text.
                    if self._HEADING_RE.match(next_line) and len(next_line) < 50:
                        end = j
                        break

                content_lines = [l.strip() for l in lines[start:end] if l.strip()]
                content = '\n'.join(content_lines)

                if len(content) > self.min_content_length:
                    sections.append({
                        "keyword": keyword,
                        "section_name": line.strip()[:100],
                        "content": content,
                        "char_count": len(content),
                        "line_start": start,
                        "line_end": end,
                    })
                # Record the range even when the content is too short, so
                # the same region is not rescanned by later keywords.
                matched_ranges.append((start, end))

        dedupe_sections = self._dedupe_sections(sections)
        logger.info(f"提取到 {len(dedupe_sections)} 个章节")
        return dedupe_sections

    def _dedupe_sections(self, sections: List[Dict]) -> List[Dict]:
        """Deduplicate sections and sort them by content richness."""
        seen = set()
        result = []
        for s in sections:
            # The first 300 characters serve as the dedup fingerprint.
            key = s["content"][:300]
            if key not in seen:
                seen.add(key)
                result.append(s)

        # Richest (longest) sections first.
        result.sort(key=lambda x: x["char_count"], reverse=True)
        return result

    async def extract_and_save(self, db: "AsyncSession", report: "Report") -> List["ExtractedContent"]:
        """Extract sections from *report*'s local PDF and persist them.

        CPU-bound work (PDF parsing, keyword matching) is pushed to a
        worker thread so the event loop stays responsive. Marks the
        report as extracted and commits once at the end. Returns the
        list of persisted ExtractedContent rows ([] when there is no
        local file or extraction yields no text).
        """
        import asyncio

        if not report.local_path:
            logger.warning(f"报告无本地路径: {report.title}")
            return []

        # Blocking CPU-bound PDF extraction runs in a thread pool.
        logger.info(f"开启线程提取 PDF: {report.title}")
        text, page_count = await asyncio.to_thread(self.extract_text_from_pdf, report.local_path)
        if not text:
            return []

        # Blocking CPU-bound keyword matching also runs in a thread pool.
        logger.info(f"开启线程提取章节: {report.title}")
        sections = await asyncio.to_thread(self.extract_sections_by_keywords, text)

        contents: List["ExtractedContent"] = []
        for section in sections:
            content = ExtractedContent(
                report_id=report.id,
                section_name=section["section_name"],
                section_keyword=section["keyword"],
                content=section["content"],
                char_count=section["char_count"],
            )
            db.add(content)
            contents.append(content)

        report.is_extracted = True
        await db.commit()

        logger.info(f"保存 {len(contents)} 个章节到数据库")
        return contents


# Module-level singleton shared by the rest of the application.
pdf_extractor = PDFExtractor()