"""
|
||
巨潮资讯网爬虫服务 (新版API)
|
||
用于获取上市公司年报/半年报
|
||
"""
|
||
import os
import re
import asyncio
from datetime import datetime
from typing import List, Optional, Dict, Any

import httpx
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import select

from app.config import settings
from app.utils.logger import logger
from app.models import Company, Report


class CninfoService:
    """Crawler service for cninfo.com.cn, adapted to the new-style API."""

    def __init__(self):
        # Base URL of the new cninfo site.
        self.base_url = "https://www.cninfo.com.cn"
        # New-style historical-announcement query API.
        self.api_url = "https://www.cninfo.com.cn/new/hisAnnouncement/query"
        # Top-search API, used to resolve a stock code to its orgId.
        self.search_url = "https://www.cninfo.com.cn/new/information/topSearch/query"

        # Browser-like XHR headers sent with every request to the APIs above.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
            "Origin": "https://www.cninfo.com.cn",
            "Referer": "https://www.cninfo.com.cn/new/disclosure",
            "X-Requested-With": "XMLHttpRequest",
        }
        # 30 s overall limit, 10 s for establishing the connection.
        self.timeout = httpx.Timeout(30.0, connect=10.0)
def _get_plate_by_code(self, stock_code: str) -> str:
|
||
"""根据股票代码判断板块 (新版API的column参数)"""
|
||
if stock_code.startswith("6"):
|
||
return "sse" # 上交所
|
||
elif stock_code.startswith("0") or stock_code.startswith("3"):
|
||
return "szse" # 深交所
|
||
elif stock_code.startswith("8") or stock_code.startswith("4"):
|
||
return "bse" # 北交所
|
||
return "szse"
|
||
|
||
async def get_org_id(self, stock_code: str) -> Optional[str]:
|
||
"""
|
||
通过搜索API获取公司的orgId
|
||
这是精确查询公告的关键
|
||
"""
|
||
try:
|
||
search_data = {"keyWord": stock_code, "maxNum": 5}
|
||
|
||
async with httpx.AsyncClient(timeout=self.timeout, verify=False) as client:
|
||
response = await client.post(
|
||
self.search_url,
|
||
data=search_data,
|
||
headers=self.headers
|
||
)
|
||
|
||
if response.status_code == 200:
|
||
results = response.json()
|
||
for item in results:
|
||
if item.get("code") == stock_code:
|
||
org_id = item.get("orgId")
|
||
logger.info(f"获取到 {stock_code} 的 orgId: {org_id}")
|
||
return org_id
|
||
|
||
logger.warning(f"无法获取 {stock_code} 的 orgId")
|
||
return None
|
||
|
||
except Exception as e:
|
||
logger.error(f"获取orgId失败: {stock_code}, 错误: {e}")
|
||
return None
|
||
|
||
async def search_reports(
|
||
self,
|
||
stock_code: str,
|
||
org_id: Optional[str] = None,
|
||
report_types: Optional[List[str]] = None
|
||
) -> List[Dict[str, Any]]:
|
||
"""
|
||
搜索公司的年报/半年报
|
||
|
||
Args:
|
||
stock_code: 股票代码
|
||
org_id: 巨潮机构ID (如果为None,会自动获取)
|
||
report_types: 报告类型列表,默认为年报和半年报
|
||
|
||
Returns:
|
||
报告列表
|
||
"""
|
||
if report_types is None:
|
||
report_types = settings.REPORT_TYPES
|
||
|
||
# 如果没有orgId,先获取
|
||
if not org_id:
|
||
org_id = await self.get_org_id(stock_code)
|
||
if not org_id:
|
||
logger.error(f"无法获取 {stock_code} 的orgId,跳过")
|
||
return []
|
||
|
||
reports = []
|
||
plate = self._get_plate_by_code(stock_code)
|
||
|
||
try:
|
||
# 新版API的category参数
|
||
category_map = {
|
||
"年度报告": "category_ndbg_szsh",
|
||
"半年度报告": "category_bndbg_szsh"
|
||
}
|
||
|
||
for report_type in report_types:
|
||
category = category_map.get(report_type)
|
||
if not category:
|
||
continue
|
||
|
||
# 构建请求参数 - 关键:stock参数使用 "代码,orgId" 格式
|
||
form_data = {
|
||
"pageNum": "1",
|
||
"pageSize": "30",
|
||
"column": plate,
|
||
"tabName": "fulltext",
|
||
"plate": "",
|
||
"stock": f"{stock_code},{org_id}", # 关键格式!
|
||
"searchkey": "",
|
||
"secid": "",
|
||
"category": category,
|
||
"trade": "",
|
||
"seDate": "",
|
||
"sortName": "",
|
||
"sortType": "",
|
||
"isHLtitle": "true"
|
||
}
|
||
|
||
logger.info(f"正在查询 {stock_code} 的 {report_type}...")
|
||
|
||
async with httpx.AsyncClient(timeout=self.timeout, verify=False) as client:
|
||
response = await client.post(
|
||
self.api_url,
|
||
data=form_data,
|
||
headers=self.headers
|
||
)
|
||
|
||
if response.status_code == 200:
|
||
data = response.json()
|
||
announcements = data.get("announcements") or []
|
||
|
||
logger.info(f"找到 {len(announcements)} 条公告记录")
|
||
|
||
for ann in announcements:
|
||
# 验证是目标公司
|
||
if ann.get("secCode") != stock_code:
|
||
continue
|
||
|
||
title = ann.get("announcementTitle", "")
|
||
# 过滤摘要、更正等非正式报告
|
||
if self._is_valid_report(title, report_type):
|
||
report_year = self._extract_year(title)
|
||
|
||
# 只抓取2024年及以后的报告
|
||
if report_year and report_year < 2024:
|
||
logger.debug(f"跳过旧报告: {title} (年份: {report_year})")
|
||
continue
|
||
|
||
report_info = {
|
||
"title": title,
|
||
"report_type": report_type,
|
||
"announcement_id": ann.get("announcementId"),
|
||
"announcement_time": ann.get("announcementTime"),
|
||
"sec_code": ann.get("secCode"),
|
||
"sec_name": ann.get("secName"),
|
||
"org_id": ann.get("orgId"),
|
||
"pdf_url": self._build_pdf_url(ann.get("adjunctUrl")),
|
||
"report_year": report_year,
|
||
"report_period": self._extract_period(title, report_type)
|
||
}
|
||
reports.append(report_info)
|
||
logger.info(f"✓ 匹配报告: [{stock_code}] {title}")
|
||
else:
|
||
logger.error(f"API请求失败: {response.status_code}")
|
||
|
||
# 请求间隔2秒,避免被封禁
|
||
await asyncio.sleep(2)
|
||
|
||
except Exception as e:
|
||
logger.error(f"搜索报告失败: {stock_code}, 错误: {e}")
|
||
import traceback
|
||
logger.error(traceback.format_exc())
|
||
|
||
return reports
|
||
|
||
def _is_valid_report(self, title: str, report_type: str) -> bool:
|
||
"""判断是否为有效的正式报告(排除摘要、更正等)"""
|
||
# 排除关键词
|
||
exclude_keywords = ["摘要", "更正", "补充", "英文", "修订", "修正", "取消", "披露", "提示"]
|
||
|
||
for keyword in exclude_keywords:
|
||
if keyword in title:
|
||
return False
|
||
|
||
# 确认包含报告类型
|
||
if report_type == "年度报告":
|
||
return "年度报告" in title or "年报" in title
|
||
elif report_type == "半年度报告":
|
||
return "半年度报告" in title or "半年报" in title
|
||
|
||
return False
|
||
|
||
def _build_pdf_url(self, adjunct_url: Optional[str]) -> Optional[str]:
|
||
"""构建PDF下载URL - 使用新版静态资源域名"""
|
||
if not adjunct_url:
|
||
return None
|
||
# 如果已经是完整URL
|
||
if adjunct_url.startswith("http"):
|
||
return adjunct_url
|
||
# 新版巨潮使用 static.cninfo.com.cn 作为PDF下载域名
|
||
return f"https://static.cninfo.com.cn/{adjunct_url}"
|
||
|
||
def _extract_year(self, title: str) -> Optional[int]:
|
||
"""从标题中提取年份"""
|
||
match = re.search(r"(\d{4})\s*年", title)
|
||
if match:
|
||
return int(match.group(1))
|
||
return None
|
||
|
||
def _extract_period(self, title: str, report_type: str) -> str:
|
||
"""提取报告期间"""
|
||
if report_type == "年度报告":
|
||
return "年报"
|
||
elif report_type == "半年度报告":
|
||
return "半年报"
|
||
return ""
|
||
|
||
async def download_pdf(self, pdf_url: str, save_path: str) -> bool:
|
||
"""
|
||
下载PDF文件
|
||
"""
|
||
try:
|
||
# 确保目录存在
|
||
os.makedirs(os.path.dirname(save_path), exist_ok=True)
|
||
|
||
logger.info(f"开始下载: {pdf_url}")
|
||
|
||
async with httpx.AsyncClient(timeout=httpx.Timeout(180.0), verify=False) as client:
|
||
response = await client.get(pdf_url, headers=self.headers, follow_redirects=True)
|
||
|
||
if response.status_code == 200:
|
||
with open(save_path, "wb") as f:
|
||
f.write(response.content)
|
||
|
||
file_size = os.path.getsize(save_path)
|
||
logger.info(f"✓ 下载成功: {save_path}, 大小: {file_size / 1024 / 1024:.2f} MB")
|
||
return True
|
||
else:
|
||
logger.error(f"下载失败: {pdf_url}, 状态码: {response.status_code}")
|
||
return False
|
||
|
||
except Exception as e:
|
||
logger.error(f"下载异常: {pdf_url}, 错误: {e}")
|
||
return False
|
||
|
||
async def sync_company_reports(
|
||
self,
|
||
db: AsyncSession,
|
||
company: Company,
|
||
force_download: bool = False
|
||
) -> int:
|
||
"""
|
||
同步公司的报告
|
||
"""
|
||
logger.info(f"========== 开始同步: {company.stock_code} {company.short_name} ==========")
|
||
|
||
# 获取或更新orgId
|
||
org_id = company.org_id
|
||
if not org_id:
|
||
org_id = await self.get_org_id(company.stock_code)
|
||
if org_id:
|
||
company.org_id = org_id
|
||
await db.commit()
|
||
|
||
# 搜索报告
|
||
reports = await self.search_reports(company.stock_code, org_id)
|
||
|
||
if not reports:
|
||
logger.warning(f"未找到任何报告: {company.stock_code}")
|
||
return 0
|
||
|
||
new_count = 0
|
||
|
||
for report_info in reports:
|
||
# 检查是否已存在
|
||
stmt = select(Report).where(
|
||
Report.announcement_id == report_info["announcement_id"]
|
||
)
|
||
result = await db.execute(stmt)
|
||
existing = result.scalar_one_or_none()
|
||
|
||
if existing:
|
||
if not force_download:
|
||
logger.debug(f"报告已存在: {report_info['title']}")
|
||
continue
|
||
|
||
# 创建新报告记录
|
||
report = Report(
|
||
company_id=company.id,
|
||
title=report_info["title"],
|
||
report_type=report_info["report_type"],
|
||
report_year=report_info["report_year"],
|
||
report_period=report_info["report_period"],
|
||
announcement_id=report_info["announcement_id"],
|
||
announcement_time=datetime.fromtimestamp(
|
||
report_info["announcement_time"] / 1000
|
||
) if report_info["announcement_time"] else None,
|
||
pdf_url=report_info["pdf_url"]
|
||
)
|
||
|
||
# 构建本地保存路径
|
||
year = report_info["report_year"] or "unknown"
|
||
period = report_info["report_period"] or "report"
|
||
filename = f"{company.stock_code}_{year}_{period}.pdf"
|
||
save_path = os.path.join(settings.PDF_DIR, company.stock_code, filename)
|
||
|
||
# 下载PDF
|
||
if report_info["pdf_url"]:
|
||
success = await self.download_pdf(report_info["pdf_url"], save_path)
|
||
if success:
|
||
report.is_downloaded = True
|
||
report.local_path = save_path
|
||
report.file_size = os.path.getsize(save_path)
|
||
|
||
if not existing:
|
||
db.add(report)
|
||
new_count += 1
|
||
|
||
await db.commit()
|
||
|
||
# 避免请求过快
|
||
await asyncio.sleep(0.5)
|
||
|
||
logger.info(f"========== 同步完成: {company.stock_code}, 新增 {new_count} 份报告 ==========")
|
||
return new_count
|
||
|
||
|
||
# Module-level singleton service instance shared by the application.
cninfo_service = CninfoService()