"""Check the format of PDF download links - detailed version."""

import asyncio

import httpx


def _find_org_id(results, stock_code):
    """Return the ``orgId`` of the search hit whose ``code`` equals *stock_code*.

    Args:
        results: Decoded JSON payload of the search request. Only a list of
            dicts is inspected; any other shape is treated as "no match".
        stock_code: Stock code to look for.

    Returns:
        The ``orgId`` value of the first matching entry, or ``None`` when
        there is no match.
    """
    if not isinstance(results, list):
        return None
    for item in results:
        if isinstance(item, dict) and item.get("code") == stock_code:
            return item.get("orgId")
    return None


def _build_pdf_url(adjunct_url):
    """Join *adjunct_url* onto the static.cninfo.com.cn host.

    A leading slash is stripped so the result never contains ``//`` right
    after the host name.
    """
    return f"https://static.cninfo.com.cn/{adjunct_url.lstrip('/')}"


async def _probe_download(client, url):
    """Send a HEAD request to *url* and print its status code and Content-Type.

    Any failure is printed rather than raised so that one bad link does not
    stop the remaining announcements from being checked.
    """
    print("测试下载...")
    try:
        dl_response = await client.head(url, follow_redirects=True)
    except Exception as e:  # deliberately broad: best-effort diagnostic probe
        print(f"测试失败: {e}")
        return
    print(f"状态码: {dl_response.status_code}")
    content_type = dl_response.headers.get("content-type", "")
    print(f"Content-Type: {content_type}")


async def check_pdf_url(stock_code: str = "300622") -> None:
    """Look up announcements for a stock and probe their PDF download links.

    The function first resolves the ``orgId`` for *stock_code* through the
    search endpoint, then queries the announcement endpoint and, for the first
    two announcements, prints the title, the raw ``adjunctUrl`` and the result
    of a HEAD request against the derived static.cninfo.com.cn link.

    Args:
        stock_code: Stock code to check. Defaults to ``"300622"``, the value
            that was previously hard-coded.

    Returns:
        None. All results are written to standard output.
    """
    print("=" * 60)
    print("检查PDF下载链接格式")
    print("=" * 60)

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "https://www.cninfo.com.cn",
        "Referer": "https://www.cninfo.com.cn/new/disclosure",
    }

    search_url = "https://www.cninfo.com.cn/new/information/topSearch/query"
    api_url = "https://www.cninfo.com.cn/new/hisAnnouncement/query"

    # NOTE(review): verify=False disables TLS certificate verification. It is
    # kept from the original script; confirm it is still required.
    # One client is shared by every request instead of opening a second one.
    async with httpx.AsyncClient(timeout=30.0, verify=False) as client:
        # Step 1: resolve the orgId for the stock code.
        search_data = {"keyWord": stock_code, "maxNum": 5}
        response = await client.post(search_url, data=search_data, headers=headers)

        org_id = None
        if response.status_code == 200:
            org_id = _find_org_id(response.json(), stock_code)
        else:
            print(f"搜索请求失败，状态码: {response.status_code}")

        print(f"orgId: {org_id}")
        if org_id is None:
            # Without an orgId the "stock" field would be sent as "<code>,None".
            print("未找到 orgId，无法查询公告")
            return

        # Step 2: query the announcements.
        form_data = {
            "pageNum": "1",
            "pageSize": "5",
            "column": "szse",
            "tabName": "fulltext",
            "stock": f"{stock_code},{org_id}",
            "category": "category_ndbg_szsh",
            "seDate": "",
        }
        response = await client.post(api_url, data=form_data, headers=headers)
        if response.status_code != 200:
            print(f"公告查询失败，状态码: {response.status_code}")
            return

        data = response.json()
        # ``dict.get(key, [])`` only falls back when the key is missing; if the
        # key is present with a null value it returns None, so use ``or []``.
        announcements = data.get("announcements") or []
        print(f"\n找到 {len(announcements)} 条公告")

        for ann in announcements[:2]:
            title = ann.get("announcementTitle", "")
            adjunct_url = ann.get("adjunctUrl", "")

            print(f"\n标题: {title}")
            print(f"adjunctUrl原始值: {adjunct_url}")

            if not adjunct_url:
                continue

            # The new-style download link should use static.cninfo.com.cn.
            new_url = _build_pdf_url(adjunct_url)
            print(f"新版链接: {new_url}")

            # Step 3: test the download link.
            await _probe_download(client, new_url)


if __name__ == "__main__":
    asyncio.run(check_pdf_url())