diff --git a/engines/crawler_bbc_search.py b/engines/crawler_bbc_search.py
index 9a15500..ab51c9b 100644
--- a/engines/crawler_bbc_search.py
+++ b/engines/crawler_bbc_search.py
@@ -7,17 +7,28 @@ import urllib.parse
 import json
 import time
 import re
+from utils.logger import logger
 
 BASE_URL = "https://www.bbc.com"
 TARGET_URL = urllib.parse.urljoin(BASE_URL, "/news")
 
 # The maximum number of blog posts to crawl
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 8
 
 
-class Crawler_BBC:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+# BBC online search
+class Crawler_BBCSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
         self.selected_blogs = {"titles": [], "urls": []}
 
     def __search_topic(self, topic: str):
@@ -26,14 +37,17 @@ class Crawler_BBC:
 
         # open up side bar to show search bar
         self.driver.find_element(By.CLASS_NAME, "sc-8a068d35-3.kvafkS").click()
-
+        time.sleep(1)
         # input topic to be searched in search bar
         search_bar = self.driver.find_element(By.CLASS_NAME, "sc-e1a87ea7-1.iARAvt")
        search_bar.send_keys(topic)
 
         # click search button
+        # search_submit_button = self.driver.find_element(
+        #     By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
+        # )
         search_submit_button = self.driver.find_element(
-            By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
+            By.CSS_SELECTOR, '[data-testid="search-input-search-button"]'
         )
         search_submit_button.click()
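
BBC's hashed class names (sc-f6c53a81-...) are regenerated by its build pipeline, so the data-testid attribute is the more durable hook for the search button. Below is a minimal standalone sketch of that locator pattern; it assumes the search panel is already open, and the explicit wait and 10-second timeout are illustrative choices, not part of the patch.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.bbc.com/news")
# Wait until the button identified by its data-testid is clickable, then click it.
search_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '[data-testid="search-input-search-button"]')
    )
)
search_button.click()
driver.quit()
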
@@ -71,27 +85,29 @@ class Crawler_BBC:
             except Exception:
                 continue
             # skip blogs that are not news
-            if not "news" in url:
-                continue
-            self.selected_blogs["titles"].append(title)
-            # bbc's href links only contains path
-            if not urllib.parse.urlparse(url).netloc:
-                url = urllib.parse.urljoin(BASE_URL, url)
-            self.selected_blogs["urls"].append(url)
+            if "news" in url and "/videos/" not in url:
+                self.selected_blogs["titles"].append(title)
+                # bbc's href links only contain the path
+                if not urllib.parse.urlparse(url).netloc:
+                    url = urllib.parse.urljoin(BASE_URL, url)
+                self.selected_blogs["urls"].append(url)
 
         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
+            if self.driver.find_elements(
                 By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
-            )
-            try:
-                next_page_btn.click()
-                time.sleep(2)
-                self.__select_blog_in_search_page()
-            except:
-                import traceback
+            ):
+                next_page_btn = self.driver.find_element(
+                    By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
+                )
+                try:
+                    next_page_btn.click()
+                    time.sleep(2)
+                    self.__select_blog_in_search_page()
+                except Exception:
+                    import traceback
 
-                print(traceback.format_exc())
+                    logger.warning(traceback.format_exc())
         else:
             self.selected_blogs["titles"] = self.selected_blogs["titles"][
                 :MAX_BLOG_LIMIT
@@ -108,14 +124,22 @@ class Crawler_BBC:
             "div", attrs={"data-component": "headline-block"}, recursive=True
         ).text
         blog_time = raw_blog.find("time", recursive=True).text
-        blog_contributor = raw_blog.find(
-            "div", attrs={"data-testid": "byline-new-contributors"}, recursive=True
-        ).text
-        blog_meta = {"time": blog_time, "author": blog_contributor}
+
+        blog_contributor_el = raw_blog.find(
+            "div",
+            attrs={"data-testid": "byline-new-contributors"},
+            recursive=True,
+        )
+        blog_contributor = ""
+        if blog_contributor_el is not None:
+            blog_contributor = blog_contributor_el.text
+
         blog_content_blocks = raw_blog.find_all(
             "div", attrs={"data-component": "text-block"}, recursive=True
         )
 
+        blog_meta = {"time": blog_time, "author": blog_contributor}
+
         blog_content = ""
 
         for block in blog_content_blocks:
@@ -125,9 +149,14 @@ class Crawler_BBC:
 
     def __get_and_save_blog(self, url: str):
         self.driver.get(url)
-        time.sleep(3)
+        WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.ID, "main-content"))
+        )
+        time.sleep(1)
 
         blog = self.__retrieve_blog()
+        return blog
 
         blog_title = blog.get("title", "")
         print(blog_title)
@@ -142,34 +171,42 @@ class Crawler_BBC:
         time.sleep(2)
 
     def search_and_save(self, topic: str):
+        content_list = []
+        logger.warning(f"Crawler_BBCSearch start search topic {topic}")
         self.__search_topic(topic)
+        time.sleep(1)
+        logger.warning("Crawler_BBCSearch start select blog in search page")
         self.__select_blog_in_search_page()
 
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_BBCSearch url_list {str(url_list)}")
+
+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"Crawler_BBCSearch {idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url)
+            content_list.append(content)
+            idx += 1
+        return content_list
 
     def direct_save(self, url: str):
         self.__get_and_save_blog(url)
 
-    def test(self):
+    def process(self, inputData):
+        logger.warning(f"Crawler_BBCSearch / inputData {inputData}")
+        keyword = inputData["keyword"]
+
+        result = []
         try:
-            self.search_and_save("US election")
-            # self.direct_save("https://www.bbc.com/news/articles/c2edewgv2kpo")
+            result = self.search_and_save(keyword)
+            # print("result", result)
         except Exception as e:
             import traceback
 
-            print(traceback.format_exc())
+            logger.warning(f"Crawler_BBCSearch {traceback.format_exc()}")
         finally:
             self.driver.quit()
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_BBC(driver)
-    crawler.test()
+        logger.warning(
+            f"Crawler_BBCSearch process completed, keyword={keyword}, result len={len(result)}"
+        )
+        return result
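
The WebDriverWait call in __get_and_save_blog blocks until the target element is attached to the DOM instead of sleeping for a fixed interval, which is faster on quick loads and more tolerant of slow ones. A small helper showing the same pattern in isolation; the helper name and the default timeout are illustrative assumptions, not part of the patch.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for(driver, locator, timeout=10):
    # Poll until the element is present in the DOM, or raise TimeoutException
    # after `timeout` seconds; returns the located WebElement.
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located(locator)
    )

# Example: wait for the BBC article container before parsing the page source.
# wait_for(driver, (By.ID, "main-content"))
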
diff --git a/engines/crawler_newscn_search.py b/engines/crawler_newscn_search.py
index 9960ef0..21f865f 100644
--- a/engines/crawler_newscn_search.py
+++ b/engines/crawler_newscn_search.py
@@ -14,25 +14,22 @@ from utils.logger import logger
 
 BASE_URL_EN = "https://english.news.cn"
 BASE_URL_CN = "https://so.news.cn/"
-XINHUA_OVERSEAS_REGIONS = [
-    "asiapacific",
-    "europe",
-    "africa",
-    "northamerica",
-    "german",
-    "20241016",
-]
+XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica", "german"]
 
 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 10
+MAX_BLOG_LIMIT = 8
 
 
+# Xinhua English-site online search
 class Crawler_NewsCN:
     def __init__(self) -> None:
         from selenium.webdriver.chrome.options import Options
 
         chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
@@ -98,18 +95,19 @@ class Crawler_NewsCN:
 
         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
-                By.CLASS_NAME, "ant-pagination-next"
-            )
-            if next_page_btn.get_attribute("aria-disabled") != "true":
-                try:
-                    next_page_btn.click()
-                    time.sleep(2)
-                    self.__select_blog_in_search_page()
-                except:
-                    import traceback
+            if self.driver.find_elements(By.CLASS_NAME, "ant-pagination-next"):
+                next_page_btn = self.driver.find_element(
+                    By.CLASS_NAME, "ant-pagination-next"
+                )
+                if next_page_btn.get_attribute("aria-disabled") != "true":
+                    try:
+                        next_page_btn.click()
+                        time.sleep(2)
+                        self.__select_blog_in_search_page()
+                    except Exception:
+                        import traceback
 
-                print(traceback.format_exc())
+                        logger.warning(traceback.format_exc())
         else:
             self.selected_blogs["titles"] = self.selected_blogs["titles"][
                 :MAX_BLOG_LIMIT
@@ -158,19 +156,26 @@ class Crawler_NewsCN:
 
     def __get_and_save_blog(self, url: str, lang):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
 
         if lang == "en":
-            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
-            # if region_code in XINHUA_OVERSEAS_REGIONS:
-            #     blog = self.__retrieve_overseas_blog()
-            # else:
-            #     blog = self.__retrieve_china_blog()
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+            # time.sleep(1)
 
-            blog = self.__retrieve_overseas_blog()
-            blog_title = blog.get("title", "")
+            region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            if region_code in XINHUA_OVERSEAS_REGIONS:
+                blog = self.__retrieve_overseas_blog()
+            else:
+                blog = self.__retrieve_china_blog()
+
+            # div = WebDriverWait(self.driver, 10).until(
+            #     EC.presence_of_element_located((By.CLASS_NAME, "detailContent"))
+            # )
+            # blog = self.__retrieve_overseas_blog()
+
+            # blog_title = blog.get("title", "")
         else:
             if lang == "cn":
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
                 blog = self.__retrieve_cn_blog()
                 blog_title = blog.get("title", "")
         # print(blog_title)
@@ -187,15 +192,18 @@ class Crawler_NewsCN:
 
     def search_and_save(self, topic: str, lang):
         content_list = []
+        logger.warning(f"Crawler_NewsCN start search topic {topic} {lang}")
         self.__search_topic(topic, lang)
         time.sleep(1)
+        logger.warning("Crawler_NewsCN start select blog in search page")
         self.__select_blog_in_search_page()
 
         # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_NewsCN url_list {str(url_list)}")
 
         idx = 1
         for url in url_list:
-            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            logger.warning(f"Crawler_NewsCN {idx}/{len(url_list)} url:{url}")
             content = self.__get_and_save_blog(url, lang)
             content_list.append(content)
             idx += 1
@@ -205,9 +213,7 @@ class Crawler_NewsCN:
         self.__get_and_save_blog(url)
 
     def process(self, inputData):
-        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
-
-        print("Crawler_NewsCN / inputData", inputData)
+        logger.warning(f"Crawler_NewsCN / inputData {inputData}")
         keyword = inputData["keyword"]
         lang = inputData["lang"]
 
@@ -218,9 +224,9 @@ class Crawler_NewsCN:
         except Exception as e:
             import traceback
 
-            print(traceback.format_exc())
-        # finally:
-        #     self.driver.quit()
+            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+        finally:
+            self.driver.quit()
 
         logger.warning(
             f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"
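
Routing in __get_and_save_blog hinges on the first path segment of the article URL: if it names an overseas region, the overseas parser runs, otherwise the China-edition parser does. A short illustration of what that expression yields; the example URL is made up.

import urllib.parse

XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica", "german"]

url = "https://english.news.cn/asiapacific/20240101/abcd1234/c.html"
# urlparse(...).path is "/asiapacific/20240101/abcd1234/c.html",
# so index 1 of the split is the leading path segment.
region_code = urllib.parse.urlparse(url).path.split("/")[1]

print(region_code)                             # asiapacific
print(region_code in XINHUA_OVERSEAS_REGIONS)  # True -> overseas parser
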
diff --git a/main.py b/main.py
index adb021b..a6bf12b 100644
--- a/main.py
+++ b/main.py
@@ -1,6 +1,7 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
 from engines.crawler_newscn_search import Crawler_NewsCN
 from engines.info_extractor import InfoExtractor
+from engines.crawler_bbc_search import Crawler_BBCSearch
 from utils.logger import logger
 import os, sys, time, traceback, json
 
@@ -10,29 +11,26 @@ sys.path.append(relative_path)
 
 from queue_processor import QueueProcessor
 
-infoExtractor = InfoExtractor()
-crawlerGoogleSearch = CrawlerGoogleSearch()
-crawler_NewsCN = Crawler_NewsCN()
-
 
 class Main(QueueProcessor):
     def processor_handle(self, input):
-        print("input:", input)  # {'keyword': '林郑月娥' }
+        print("input:", input)
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
 
         match currentEngineId:
             case 3000:  # InfoExtractor 实体信息提取
-                return infoExtractor.process(inputData)
-            case 9000:  # crawler_google_search google在线所搜
-                return crawlerGoogleSearch.process(inputData)
-            case 10000:  # crawler_bbc_search bbc在线所搜
-                print(2)
+                return InfoExtractor().process(inputData)
+            case 9000:  # crawler_google_search Google online search
+                return CrawlerGoogleSearch().process(inputData)
+            case 10000:  # crawler_bbc_search BBC online search
+                return Crawler_BBCSearch().process(inputData)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
             case 13000:  # crawler_NewsCN 新华网英文站在线搜索
-                return crawler_NewsCN.process(inputData)
+                # crawler_NewsCN = Crawler_NewsCN()
+                return Crawler_NewsCN().process(inputData)
 
 
 if __name__ == "__main__":
diff --git a/submit_test.py b/submit_test.py
index 676c6a9..b578811 100644
--- a/submit_test.py
+++ b/submit_test.py
@@ -17,8 +17,11 @@ class submit_test:
 # # 创建一个测试队列:9000 / GoogleSearch
 # submit_test.submit(9000, {"keyword": keyword})
 
-# # 创建一个测试队列:10000 / BBCSearch
-# submit_test.submit(10000, {"keyword": keyword})
+# # Create a test queue: 10000 / BBCSearch
+submit_test.submit(10000, {"keyword": "习近平"})
+submit_test.submit(10000, {"keyword": "US election"})
+# submit_test.submit(10000, {"keyword": "US election"})
+# submit_test.submit(10000, {"keyword": "US election"})
 
 # # 创建一个测试队列:11000 / Wikipedia
 # submit_test.submit(11000, {"keyword": keyword})
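
Each crawler engine exposes the same process(inputData) entry point that main.py dispatches on by currentEngineId, so an engine can also be exercised directly without going through the queue. A minimal sketch of driving the BBC engine that way; the keyword is only an example, and the "title" field is assumed from __retrieve_blog's return value.

from engines.crawler_bbc_search import Crawler_BBCSearch

# Run one search end to end; process() quits the webdriver in its finally block.
result = Crawler_BBCSearch().process({"keyword": "US election"})
print(f"{len(result)} articles")
for blog in result:
    print(blog.get("title", ""))
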