import datetime
import os
import time
from urllib.parse import urljoin

import pdfkit
import scrapy
from apscheduler.schedulers.background import BackgroundScheduler
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

class ClaieSpider(scrapy.Spider):
    """Crawl claie.org, strip navigation/footer boilerplate from each page,
    and save the remaining text content as a PDF via wkhtmltopdf."""

    name = 'claie'
    allowed_domains = ['claie.org']
    start_urls = ['https://www.claie.org/']

    # Path to the wkhtmltopdf binary used by pdfkit.
    path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'  # Windows
    config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

    # path_wkhtmltopdf = '/usr/local/bin/wkhtmltopdf'  # Linux
    # config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)

    def parse(self, response):
        """Extract the page's visible text (minus known boilerplate strings),
        save it as a PDF, and recursively follow all in-domain links.

        Yields scrapy.Request objects for every link on the allowed domain.
        """
        # Site-specific boilerplate (menus, footer, sponsor credits) that
        # should not appear in the saved PDF content.
        unwanted_texts = [
            '关于我们', '法律声明', '下载中心', '联系我们',
            '支持:北京大学', '中国人民大学', '中国通用航空发展协会', '中国直升机产业发展协会', '欧美同学会企业家联谊会',
            '协办:领导干部国学大讲堂 曲阜儒商会馆管理有限公司',
            '版权所有:中国低空产业经济研究院 国城弦歌(北京)文化传播中心',
            'http://claie.org', 'E-mail:907421288@qq.com',
            '首页', '政策法规', '产业动态', '产业经济', '航校培训', '生产基地',
            '维护租赁', '销售物流', '产业咨询', '产业案例', '园区规划', '应急救援',
            '医疗救援', '消防救援', '农药喷洒', '海上救援', '地震救援', '俱乐部'
        ]

        page_title = response.css('title::text').get()
        all_texts = response.css('body *::text').getall()

        # Keep only text nodes that contain none of the boilerplate markers.
        page_content = ' '.join([
            text.strip() for text in all_texts
            if not any(unwanted in text for unwanted in unwanted_texts)
        ])

        self.save_page_as_pdf(response.url, page_title, page_content)

        # Follow every hyperlink that stays on the allowed domain.
        for href in response.css('a::attr(href)').extract():
            url = urljoin(response.url, href)
            if self.allowed_domains[0] in url:
                yield scrapy.Request(url, callback=self.parse)

    def save_page_as_pdf(self, url, title, content):
        """Render a minimal HTML document and write it to disk as a PDF.

        Args:
            url: the page URL (embedded as a link in the PDF).
            title: the page <title> text; may be None for pages without one.
            content: the extracted plain-text body.
        """
        directory = 'downloaded_pages_pdf'
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(directory, exist_ok=True)

        # Pages without a <title> yield None; slicing None would raise TypeError.
        title = title or 'untitled'

        html_content = f"""
        <html>
        <head>
            <meta charset="utf-8">
            <title>{title}</title>
        </head>
        <body>
            <h1>{title}</h1>
            <p><a href="{url}">{url}</a></p>
            <p>{content}</p>
        </body>
        </html>
        """

        # Derive a filesystem-safe filename from the first 50 title characters.
        filename = os.path.join(directory, f"{title[:50].strip().replace(' ', '_').replace('/', '_')}.pdf")

        # Reuse the class-level pdfkit configuration instead of rebuilding
        # an identical one for every page.
        pdfkit.from_string(html_content, filename, configuration=self.config)
        self.log(f'Saved page as PDF: {filename}')

class TaskScheduler:
    """Thin wrapper around APScheduler's BackgroundScheduler for daily cron jobs."""

    def __init__(self):
        self.scheduler = BackgroundScheduler()

    def add_task(self, timedTask, hour=9, minute=46):
        """Schedule `timedTask` to run once a day via a cron trigger.

        Defaults to 09:46 daily, matching the previously hard-coded time.
        (The old comment claimed 2 AM, which did not match the code; the
        time is now an explicit, overridable parameter.)

        Args:
            timedTask: zero-argument callable to execute.
            hour: hour of day (0-23) for the cron trigger.
            minute: minute of hour (0-59) for the cron trigger.
        """
        self.scheduler.add_job(timedTask, 'cron', hour=hour, minute=minute)

    def start_scheduler(self):
        """Start the scheduler's background thread."""
        self.scheduler.start()

    def stop_scheduler(self):
        """Shut down the scheduler and its worker threads."""
        self.scheduler.shutdown()

def run_spider():
    """Run ClaieSpider once inside a blocking CrawlerProcess."""
    settings = get_project_settings()
    crawler = CrawlerProcess(settings)
    crawler.crawl(ClaieSpider)
    crawler.start()

if __name__ == "__main__":
    scheduler = TaskScheduler()
    # Run the spider daily at the time configured in TaskScheduler.add_task
    # (09:46 by default).
    scheduler.add_task(run_spider)
    scheduler.start_scheduler()
    try:
        # Sleep instead of `while True: pass`: the old busy-wait pegged one
        # CPU core at 100% while the scheduler's background thread did the work.
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        scheduler.stop_scheduler()