1. Background

Projects have kept me busy lately, so yours truly (qiang~) has fallen a bit behind on studying. Still, I managed to carve time out of a packed schedule to support a reader's request: crawl news from a few finance websites and aggregate it automatically.

After reading the earlier article "Automatic Aggregation of AI News and Report Generation", this reader wanted to plug the same pipeline into the finance domain, so all in all I spent 2-3 days completing the request.

Note: crawling is not my strong suit, just a bit of a hobby; also, this article is mainly for personal study, so please do not use it directly for commercial purposes.

2. Challenges

1. Choosing a crawler framework: I use crawl4ai, which I recently learned on the fly, as the base framework, relying on its more advanced features to closely mimic a human visiting the site in a browser, since these sites all have anti-crawling mechanisms such as authentication, cookies, and so on;

2. Overseas news: requires kexue internet access (i.e., going through a proxy);

3. Parsing news content: this is where most of the effort went. The hard part is not parsing the HTML itself, but wiring dynamically loaded pages into crawl4ai, and every news site does it differently; see the sketch after this list.
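As a taste of what that wiring looks like, here is a minimal sketch, not the full solution: fetch_dynamic_page is a name I made up for illustration, while the parameters (js_code, wait_for, page_timeout, delay_before_return_html) mirror the CLS code in section 4. The injected JS scrolls the page so lazily loaded items render, and wait_for blocks until the target container is present before the HTML is returned.

import asyncio
from crawl4ai import AsyncWebCrawler

async def fetch_dynamic_page(url: str) -> str:
    # Scroll to the bottom so lazily loaded items are rendered before extraction.
    js_scroll = ["window.scrollTo(0, document.body.scrollHeight);"]
    async with AsyncWebCrawler(headless=True, verbose=False) as crawler:
        result = await crawler.arun(
            url=url,
            js_code=js_scroll,                # simulate the user scrolling down
            wait_for='div.detail-content',    # selector the detail-page crawler below waits for
            page_timeout=15000,
            delay_before_return_html=2.0,
        )
        return result.html

# asyncio.run(fetch_dynamic_page('https://www.cls.cn/depth?id=1000'))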

3. Data Sources

Source | URL | Notes
lian | https://www.cls.cn/depth?id=1000, https://www.cls.cn/depth?id=1003, https://www.cls.cn/depth?id=1007 | 1000: headlines; 1003: A-shares; 1007: global
huang | https://finance.ifeng.com/shanklist/1-64-/ |
lang | https://finance.sina.com.cn/roll/#pageid=384&lid=2519&k=&num=50&page=1, https://finance.sina.com.cn/roll/#pageid=384&lid=2672&k=&num=50&page=1 | 2519: finance; 2672: US stocks
qiu Times | https://finance.huanqiu.com |
zaobao | https://www.zaobao.com/finance/china, https://www.zaobao.com/finance/world | China and world
fox | https://www.foxnews.com/category/us/economy, https://www.foxnews.com//world/global-economy | US and world
cnn | https://edition.cnn.com/business, https://edition.cnn.com/business/china | China and world
reuters | https://www.reuters.com/business |
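For reference, one way to keep these sources in a single place is a config dict that a dispatcher could iterate over. This is purely an illustration, not code from the original project; the keys and structure are my assumption, and only the URLs come from the table above.

# Illustrative config only; keys and layout are assumptions, URLs are from the table.
FINANCE_SOURCES = {
    'lian': {
        'urls': ['https://www.cls.cn/depth?id=1000',
                 'https://www.cls.cn/depth?id=1003',
                 'https://www.cls.cn/depth?id=1007'],
        'note': '1000: headlines, 1003: A-shares, 1007: global',
    },
    'huang': {'urls': ['https://finance.ifeng.com/shanklist/1-64-/']},
    'lang': {'urls': [
        'https://finance.sina.com.cn/roll/#pageid=384&lid=2519&k=&num=50&page=1',
        'https://finance.sina.com.cn/roll/#pageid=384&lid=2672&k=&num=50&page=1']},
    'qiu': {'urls': ['https://finance.huanqiu.com']},
    'zaobao': {'urls': ['https://www.zaobao.com/finance/china',
                        'https://www.zaobao.com/finance/world']},
    'fox': {'urls': ['https://www.foxnews.com/category/us/economy',
                     'https://www.foxnews.com//world/global-economy']},
    'cnn': {'urls': ['https://edition.cnn.com/business',
                     'https://edition.cnn.com/business/china']},
    'reuters': {'urls': ['https://www.reuters.com/business']},
}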

4. Partial Source Code

To reduce risk, yours truly (qiang~) only shows the parsing code for the cai-lian-she pages (the "lian" entry above); readers who would like to discuss further can contact me by private message.

Code walkthrough:

1. schema is a strategy expressed as JSON layered with CSS selectors; given a schema, crawl4ai can extract specific page elements in a structured way.

2. js_commands is JavaScript code, mainly used to simulate scrolling and paging down the way a reader would while browsing the news list.
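Before the full listing, here is a standalone illustration of the schema idea: mini_schema is a trimmed-down copy of one field from the CLS list schema used below, and JsonCssExtractionStrategy is the class the full code passes it to. The full crawler then combines this with the scrolling js_commands.

from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# A trimmed-down schema: one base container and one nested list of link hrefs.
mini_schema = {
    'name': 'cls depth page links',
    'baseSelector': 'div.f-l.content-left',
    'fields': [
        {
            'name': 'bottom_titles',
            'selector': 'div.b-t-1 div.clearfix',
            'type': 'nested_list',
            'fields': [
                {'name': 'href', 'type': 'attribute', 'attribute': 'href', 'selector': 'a[href]'}
            ]
        }
    ]
}

# crawl4ai turns the schema into an extraction strategy and returns JSON per matched element.
strategy = JsonCssExtractionStrategy(mini_schema, verbose=False)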

import asyncio
import json
import os
import datetime
import re
import hashlib
from typing import Dict, Any, Union, List

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy


def md5(text):
    """Return the hex MD5 digest of a string (used as a stable article id)."""
    m = hashlib.md5()
    m.update(text.encode('utf-8'))
    return m.hexdigest()


def get_datas(file_path, json_flag=True, all_flag=False, mode='r'):
    """Read a text file, optionally parsing each line (or the whole file) as JSON."""
    results = []
    with open(file_path, mode, encoding='utf-8') as f:
        for line in f.readlines():
            if json_flag:
                results.append(json.loads(line))
            else:
                results.append(line.strip())
        if all_flag:
            if json_flag:
                return json.loads(''.join(results))
            else:
                return '\n'.join(results)
        return results


def save_datas(file_path, datas, json_flag=True, all_flag=False, with_indent=False, mode='w'):
    """Save records to a text file, either as one JSON document or one JSON record per line."""
    with open(file_path, mode, encoding='utf-8') as f:
        if all_flag:
            if json_flag:
                f.write(json.dumps(datas, ensure_ascii=False, indent=4 if with_indent else None))
            else:
                f.write(''.join(datas))
        else:
            for data in datas:
                if json_flag:
                    f.write(json.dumps(data, ensure_ascii=False) + '\n')
                else:
                    f.write(data + '\n')


class AbstractAICrawler():

    def __init__(self) -> None:
        pass

    def crawl(self):
        raise NotImplementedError()


class AINewsCrawler(AbstractAICrawler):
    def __init__(self, domain) -> None:
        super().__init__()
        self.domain = domain
        self.file_path = f'data/{self.domain}.json'
        self.history = self.init()

    def init(self):
        # Load previously crawled articles (keyed by id) so they are not fetched twice.
        if not os.path.exists(self.file_path):
            return {}
        return {ele['id']: ele for ele in get_datas(self.file_path)}

    def save(self, datas: Union[List, Dict]):
        if isinstance(datas, dict):
            datas = [datas]
        self.history.update({ele['id']: ele for ele in datas})
        save_datas(self.file_path, datas=list(self.history.values()))

    async def crawl(self, url: str,
                    schema: Dict[str, Any] = None,
                    always_by_pass_cache=True,
                    bypass_cache=True,
                    headless=True,
                    verbose=False,
                    magic=True,
                    page_timeout=15000,
                    delay_before_return_html=2.0,
                    wait_for='',
                    js_code=None,
                    js_only=False,
                    screenshot=False,
                    headers={}):
        # With a schema, crawl4ai extracts structured JSON; otherwise the raw HTML is returned.
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=verbose) if schema else None

        async with AsyncWebCrawler(verbose=verbose,
                                   headless=headless,
                                   always_by_pass_cache=always_by_pass_cache,
                                   headers=headers) as crawler:
            result = await crawler.arun(
                url=url,
                extraction_strategy=extraction_strategy,
                bypass_cache=bypass_cache,
                page_timeout=page_timeout,
                delay_before_return_html=delay_before_return_html,
                wait_for=wait_for,
                js_code=js_code,
                magic=magic,
                remove_overlay_elements=True,
                process_iframes=True,
                exclude_external_links=True,
                js_only=js_only,
                screenshot=screenshot
            )
            assert result.success, "Failed to crawl the page"
            if schema:
                res = json.loads(result.extracted_content)
                if screenshot:
                    return res, result.screenshot
                return res
            return result.html


class FinanceNewsCrawler(AINewsCrawler):
    def __init__(self, domain='') -> None:
        super().__init__(domain)

    def save(self, datas: Union[List, Dict]):
        # Unlike the parent class, append only the new records instead of rewriting the whole file.
        if isinstance(datas, dict):
            datas = [datas]
        self.history.update({ele['id']: ele for ele in datas})
        save_datas(self.file_path, datas=datas, mode='a')

    async def get_last_day_data(self):
        # Return the saved articles whose date field falls on yesterday.
        last_day = (datetime.date.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        datas = self.init()
        return [v for v in datas.values() if last_day in v['date']]


class CLSCrawler(FinanceNewsCrawler):
    """Crawler for the CLS (cls.cn) depth pages."""

    def __init__(self) -> None:
        self.domain = 'cls'
        super().__init__(self.domain)
        self.url = 'https://www.cls.cn'

    async def crawl_url_list(self, url='https://www.cls.cn/depth?id=1000'):
        # Schema: JSON plus CSS selectors describing where the article links live on the list page.
        schema = {
            'name': 'caijingwang toutiao page crawler',
            'baseSelector': 'div.f-l.content-left',
            'fields': [
                {
                    'name': 'top_titles',
                    'selector': 'div.depth-top-article-list',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href', 'selector': 'a[href]'}
                    ]
                },
                {
                    'name': 'sec_titles',
                    'selector': 'div.depth-top-article-list li.f-l',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href', 'selector': 'a[href]'}
                    ]
                },
                {
                    'name': 'bottom_titles',
                    'selector': 'div.b-t-1 div.clearfix',
                    'type': 'nested_list',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href', 'selector': 'a[href]'}
                    ]
                }
            ]
        }
        # JS run inside the page: keep scrolling and clicking "load more" until ~100 items are present.
        js_commands = ["""(async () => {{
            await new Promise(resolve => setTimeout(resolve, 500));
            const targetItemCount = 100;
            let currentItemCount = document.querySelectorAll('div.b-t-1 div.clearfix a.f-w-b').length;
            let loadMoreButton = document.querySelector('.list-more-button.more-button');
            while (currentItemCount < targetItemCount) {{
                window.scrollTo(0, document.body.scrollHeight);
                await new Promise(resolve => setTimeout(resolve, 1000));
                if (loadMoreButton) {
                    loadMoreButton.click();
                } else {
                    console.log('没有找到加载更多按钮');
                    break;
                }
                await new Promise(resolve => setTimeout(resolve, 1000));
                currentItemCount = document.querySelectorAll('div.b-t-1 div.clearfix a.f-w-b').length;
                loadMoreButton = document.querySelector('.list-more-button.more-button');
            }}
            console.log(`已加载 ${currentItemCount} 个item`);
            return currentItemCount;
        }})();
        """]
        wait_for = ''
        results = {}
        menu_dict = {
            '1000': '头条',
            '1003': 'A股',
            '1007': '环球'
        }
        for k, v in menu_dict.items():
            url = f'https://www.cls.cn/depth?id={k}'
            try:
                links = await super().crawl(url, schema, always_by_pass_cache=True, bypass_cache=True,
                                            js_code=js_commands, wait_for=wait_for, js_only=False)
            except Exception as e:
                print(f'error {url}')
                links = []
            if links:
                # Flatten the three link groups, dedupe, and map each absolute URL to its category.
                links = [ele['href'] for eles in links[0].values() for ele in eles if 'href' in ele]
                links = sorted(list(set(links)), key=lambda x: x)
                results.update({f'{self.url}{ele}': v for ele in links})
        return results

    async def crawl_newsletter(self, url, category):
        # Schema for the article detail page: title, time, abstract, body paragraphs, read count.
        schema = {
            'name': '财联社新闻详情页',
            'baseSelector': 'div.f-l.content-left',
            'fields': [
                {'name': 'title', 'selector': 'span.detail-title-content', 'type': 'text'},
                {'name': 'time', 'selector': 'div.m-r-10', 'type': 'text'},
                {
                    'name': 'abstract',
                    'selector': 'pre.detail-brief',
                    'type': 'text',
                    'fields': [
                        {'name': 'href', 'type': 'attribute', 'attribute': 'href', 'selector': 'a[href]'}
                    ]
                },
                {
                    'name': 'contents',
                    'selector': 'div.detail-content p',
                    'type': 'list',
                    'fields': [
                        {'name': 'content', 'type': 'text'}
                    ]
                },
                {'name': 'read_number', 'selector': 'div.detail-option-readnumber', 'type': 'text'}
            ]
        }
        wait_for = 'div.detail-content'
        try:
            results = await super().crawl(url, schema, always_by_pass_cache=True, bypass_cache=True,
                                          wait_for=wait_for)
            result = results[0]
        except Exception as e:
            print(f'crawler error: {url}')
            return {}
        return {
            'title': result['title'],
            'abstract': result['abstract'],
            'date': result['time'],
            'link': url,
            'content': '\n'.join([ele['content'] for ele in result['contents'] if 'content' in ele and ele['content']]),
            'id': md5(url),
            'type': category,
            'read_number': await self.get_first_float_number(result['read_number'], r'[-+]?\d*\.\d+|\d+'),
            'time': datetime.datetime.now().strftime('%Y-%m-%d')
        }

    async def get_first_float_number(self, text, pattern):
        # Pull the first numeric token out of a string such as the read counter.
        match = re.search(pattern, text)
        if match:
            return round(float(match.group()), 4)
        return 0

    async def crawl(self):
        # Entry point: list pages -> detail pages -> save, skipping articles already in history.
        link_2_category = await self.crawl_url_list()
        for link, category in link_2_category.items():
            _id = md5(link)
            if _id in self.history:
                continue
            news = await self.crawl_newsletter(link, category)
            if news:
                self.save(news)
        return await self.get_last_day_data()


if __name__ == '__main__':
    asyncio.run(CLSCrawler().crawl())
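For completeness, a small usage sketch building on the definitions above. The data/ directory is an assumption about the working directory (the crawler appends to data/cls.json); everything else reuses CLSCrawler and get_datas from the listing.

import asyncio
import os

os.makedirs('data', exist_ok=True)              # assumed layout: the crawler appends to data/cls.json
yesterday_news = asyncio.run(CLSCrawler().crawl())   # returns the articles dated yesterday
print(f'{len(yesterday_news)} articles dated yesterday')

saved = get_datas('data/cls.json')              # one JSON record per line
if saved:
    print(saved[0]['title'], saved[0]['link'])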

5. Summary

One sentence is enough~

I built a tool that automatically aggregates news, implemented on top of the crawl4ai framework.

If you have questions, feel free to reach out by private message or leave a comment!

6. References

(1) Crawl4ai: https://github.com/unclecode/crawl4ai
