import os
import time
from multiprocessing import Process
from urllib.parse import urljoin

import pdfkit
import scrapy
from apscheduler.schedulers.background import BackgroundScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


class ClaieSpider(scrapy.Spider):
    name = 'claie'
    allowed_domains = ['claie.org']
    start_urls = ['https://www.claie.org/']

    path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # for Windows
    config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)
    # path_wkhtmltopdf = '/usr/local/bin/wkhtmltopdf'  # for Linux
    # config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

    def parse(self, response):
        # Boilerplate strings (navigation menus, footer, contact info) to
        # filter out of the page text before saving.
        unwanted_texts = [
            '关于我们', '法律声明', '下载中心', '联系我们', '支持:北京大学',
            '中国人民大学', '中国通用航空发展协会', '中国直升机产业发展协会',
            '欧美同学会企业家联谊会', '协办:领导干部国学大讲堂 曲阜儒商会馆管理有限公司',
            '版权所有:中国低空产业经济研究院 国城弦歌(北京)文化传播中心',
            'http://claie.org', 'E-mail:907421288@qq.com', '首页', '政策法规',
            '产业动态', '产业经济', '航校培训', '生产基地', '维护租赁', '销售物流',
            '产业咨询', '产业案例', '园区规划', '应急救援', '医疗救援', '消防救援',
            '农药喷洒', '海上救援', '地震救援', '俱乐部'
        ]

        page_title = response.css('title::text').get() or 'untitled'
        all_texts = response.css('body *::text').getall()
        page_content = ' '.join([
            text.strip() for text in all_texts
            if not any(unwanted in text for unwanted in unwanted_texts)
        ])

        self.save_page_as_pdf(response.url, page_title, page_content)

        # Follow every in-domain link and parse it the same way.
        for href in response.css('a::attr(href)').getall():
            url = urljoin(response.url, href)
            if self.allowed_domains[0] in url:
                yield scrapy.Request(url, callback=self.parse)

    def save_page_as_pdf(self, url, title, content):
        directory = 'downloaded_pages_pdf'
        os.makedirs(directory, exist_ok=True)

        html_content = f"""
        <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
        </head>
        <body>
            <h1>{title}</h1>
            <p><a href="{url}">{url}</a></p>
            <p>{content}</p>
        </body>
        </html>
        """

        # Derive a filesystem-safe name from the first 50 characters of the title.
        filename = os.path.join(
            directory,
            f"{title[:50].strip().replace(' ', '_').replace('/', '_')}.pdf"
        )
        pdfkit.from_string(html_content, filename, configuration=self.config)
        self.logger.info(f'Saved page as PDF {filename}')


class TaskScheduler:
    def __init__(self):
        self.scheduler = BackgroundScheduler()

    def add_task(self, timed_task):
        # Use a cron trigger to run the task daily at 09:46.
        self.scheduler.add_job(timed_task, 'cron', hour=9, minute=46)

    def start_scheduler(self):
        self.scheduler.start()

    def stop_scheduler(self):
        self.scheduler.shutdown()


def _crawl():
    process = CrawlerProcess(get_project_settings())
    process.crawl(ClaieSpider)
    process.start()


def run_spider():
    # Run each crawl in a fresh child process: Twisted's reactor cannot be
    # restarted, so calling CrawlerProcess.start() a second time in the same
    # process would raise ReactorNotRestartable on the next scheduled run.
    p = Process(target=_crawl)
    p.start()
    p.join()


if __name__ == "__main__":
    scheduler = TaskScheduler()
    scheduler.add_task(run_spider)  # runs the spider once a day, at 09:46 per the cron trigger
    scheduler.start_scheduler()
    try:
        # Keep the main thread alive without busy-waiting so the
        # background scheduler can fire.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        scheduler.stop_scheduler()
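
# --- Sketch: explicit crawler settings (an assumption, not part of the original) ---
# If this script is run outside a Scrapy project directory,
# get_project_settings() falls back to Scrapy's defaults. A minimal sketch of
# passing explicit, politeness-oriented settings to CrawlerProcess instead;
# the values below are illustrative assumptions:
#
# process = CrawlerProcess(settings={
#     'ROBOTSTXT_OBEY': True,    # respect robots.txt
#     'DOWNLOAD_DELAY': 1.0,     # pause between requests to the same domain
#     'CONCURRENT_REQUESTS': 8,
# })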
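
# --- Sketch: BlockingScheduler alternative (an assumption, not part of the original) ---
# With APScheduler's BlockingScheduler the keep-alive loop in __main__ becomes
# unnecessary, since start() blocks the main thread until interrupted:
#
# from apscheduler.schedulers.blocking import BlockingScheduler
#
# blocking = BlockingScheduler()
# blocking.add_job(run_spider, 'cron', hour=9, minute=46)
# try:
#     blocking.start()  # blocks here; Ctrl+C raises KeyboardInterrupt
# except (KeyboardInterrupt, SystemExit):
#     pass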