diff --git a/engines/crawler_newscn_search.py b/engines/crawler_newscn_search.py
index 4c8e1ca..9960ef0 100644
--- a/engines/crawler_newscn_search.py
+++ b/engines/crawler_newscn_search.py
@@ -8,36 +8,71 @@ import json
 import time
 import re
 import os
+from utils.logger import logger
 
-BASE_URL = "https://english.news.cn"
-XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica"]
+# from utils.logger import logger
+
+BASE_URL_EN = "https://english.news.cn"
+BASE_URL_CN = "https://so.news.cn/"
+XINHUA_OVERSEAS_REGIONS = [
+    "asiapacific",
+    "europe",
+    "africa",
+    "northamerica",
+    "german",
+    "20241016",
+]
 
 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 10
 
 
 class Crawler_NewsCN:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
         self.selected_blogs = {"titles": [], "urls": []}
 
-    def __search_topic(self, topic: str):
-        self.driver.get(BASE_URL)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+    def __search_topic(self, topic: str, lang: str):
+        if lang == "en":
+            self.driver.get(BASE_URL_EN)
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
 
-        # input topic to be searched in search bar
-        search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
-        search_bar.send_keys(topic)
+            # input topic to be searched in search bar
+            search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
+            search_bar.send_keys(topic)
 
-        # click search button
-        search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
-        search_submit_button.click()
+            # click search button
+            search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+            search_submit_button.click()
 
-        # close home window and switch to new window
-        self.driver.close()
-        self.driver.switch_to.window(self.driver.window_handles[0])
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+            # close home window and switch to new window
+            self.driver.close()
+            self.driver.switch_to.window(self.driver.window_handles[0])
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+        else:
+            if lang == "cn":
+                self.driver.get(BASE_URL_CN)
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
+
+                # input topic to be searched in search bar
+                search_bar = self.driver.find_element(By.CLASS_NAME, "input")
+                search_bar.send_keys(topic)
+
+                # click search button
+                # search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+                search_submit_button = self.driver.find_element(
+                    By.CLASS_NAME, "search-button"
+                )
+                search_submit_button.click()
+                self.driver.switch_to.window(self.driver.window_handles[0])
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
 
     def __select_blog_in_search_page(self):
         raw_blogs = self.driver.find_element(By.CLASS_NAME, "content").get_attribute(
@@ -103,57 +138,91 @@ class Crawler_NewsCN:
 
         return {"title": blog_title, "meta": blog_meta, "content": blog_content}
 
-    def __get_and_save_blog(self, url: str):
+    # Used when lang == "cn"
+    def __retrieve_cn_blog(self) -> dict:
+        blog_title = (
+            self.driver.find_element(By.CLASS_NAME, "head-line.clearfix")
+            .find_element(By.CLASS_NAME, "title")
+            .text
+        )
+
+        blog_meta = (
+            self.driver.find_element(By.CLASS_NAME, "header-cont.clearfix")
+            .find_element(By.CLASS_NAME, "source")
+            .text
+        )
+
+        blog_content = self.driver.find_element(By.ID, "detailContent").text
+
+        return {"title": blog_title, "meta": blog_meta, "content": blog_content}
+
+    def __get_and_save_blog(self, url: str, lang: str):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
-        region_code = urllib.parse.urlparse(url).path.split("/")[1]
-        if region_code in XINHUA_OVERSEAS_REGIONS:
+        WebDriverWait(self.driver, 10).until(EC.title_contains("新华" if lang == "cn" else "Xinhua"))
+
+        if lang == "en":
+            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            # if region_code in XINHUA_OVERSEAS_REGIONS:
+            #     blog = self.__retrieve_overseas_blog()
+            # else:
+            #     blog = self.__retrieve_china_blog()
+            blog = self.__retrieve_overseas_blog()
+            blog_title = blog.get("title", "")
         else:
-            blog = self.__retrieve_china_blog()
-        blog_title = blog.get("title", "")
-        print(blog_title)
+            if lang == "cn":
+                blog = self.__retrieve_cn_blog()
+                blog_title = blog.get("title", "")
+        # print(blog_title)
 
         # Remove invalid char in file_path_name on Windows
-        invalid_chars_pattern = r'[\\/:*?"<>|]'
-        blog_title = re.sub(invalid_chars_pattern, "", blog_title)
+        # invalid_chars_pattern = r'[\\/:*?"<>|]'
+        # blog_title = re.sub(invalid_chars_pattern, "", blog_title)
 
-        file = open(os.path.join("", "Xinhua_{blog_title}.json"), "w")
-        json.dump(blog, file)
-        file.close()
-        time.sleep(2)
+        # file = open(os.path.join("", f"Xinhua_{blog_title}.json"), "w")
+        # json.dump(blog, file)
+        # file.close()
+        # time.sleep(2)
+        return blog
 
-    def search_and_save(self, topic: str):
-        self.__search_topic(topic)
+    def search_and_save(self, topic: str, lang: str):
+        content_list = []
+        self.__search_topic(topic, lang)
+        time.sleep(1)
         self.__select_blog_in_search_page()
-        print(self.selected_blogs)
+        # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
+
+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url, lang)
+            content_list.append(content)
+            idx += 1
+        return content_list
 
     def direct_save(self, url: str):
        self.__get_and_save_blog(url)
 
-    def test(self):
+    def process(self, inputData):
+        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
+
+        print("Crawler_NewsCN / inputData", inputData)
+        keyword = inputData["keyword"]
+        lang = inputData["lang"]
+
+        result = []
         try:
-            # self.search_and_save("china")
-            self.search_and_save("xi jinping")
-            # self.direct_save("")
+            result = self.search_and_save(keyword, lang)
+            # print("result", result)
        except Exception as e:
            import traceback
 
            print(traceback.format_exc())
-        finally:
-            self.driver.quit()
+        # finally:
+        #     self.driver.quit()
 
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_NewsCN(driver)
-    crawler.test()
+        logger.warning(
+            f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
diff --git a/engines/info_extractor.py b/engines/info_extractor.py
new file mode 100644
index 0000000..223cc49
--- /dev/null
+++ b/engines/info_extractor.py
@@ -0,0 +1,3 @@
+class InfoExtractor:
+    def process(self, inputData):
+        print(inputData)
diff --git a/main.py b/main.py
index 0b59654..adb021b 100644
--- a/main.py
+++ b/main.py
@@ -1,4 +1,6 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
+from engines.crawler_newscn_search import Crawler_NewsCN
+from engines.info_extractor import InfoExtractor
 from utils.logger import logger
 import os, sys, time, traceback, json
 
@@ -8,26 +10,29 @@ sys.path.append(relative_path)
 
 from queue_processor import QueueProcessor
 
+infoExtractor = InfoExtractor()
+crawlerGoogleSearch = CrawlerGoogleSearch()
+crawler_NewsCN = Crawler_NewsCN()
+
 
 class Main(QueueProcessor):
     def processor_handle(self, input):
         print("input:", input)
         # {'keyword': '林郑月娥' }
-
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
-        # keyword = inputData["keyword"]
-        # print("keyword:", keyword)
 
         match currentEngineId:
-            case 3000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 9000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 10000:  # crawler_bbc_search
+            case 3000:  # InfoExtractor: entity information extraction
+                return infoExtractor.process(inputData)
+            case 9000:  # crawler_google_search: Google online search
+                return crawlerGoogleSearch.process(inputData)
+            case 10000:  # crawler_bbc_search: BBC online search
                 print(2)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
+            case 13000:  # crawler_NewsCN: Xinhua online search
+                return crawler_NewsCN.process(inputData)
 
 
 if __name__ == "__main__":
diff --git a/submit_test.py b/submit_test.py
index 2a67f02..676c6a9 100644
--- a/submit_test.py
+++ b/submit_test.py
@@ -8,20 +8,24 @@ from queue_client import QueueClient
 
 
 class submit_test:
-    def submit(code, keyword):
+    def submit(code, data):
         client = QueueClient()
-        returnData = client.call(code, {"keyword": keyword})
+        returnData = client.call(code, data)
         print(returnData)
 
 
-# 创建一个测试队列:9000 / GoogleSearch
-submit_test.submit(9000, "林郑月娥")
+# # Create a test queue: 9000 / GoogleSearch
+# submit_test.submit(9000, {"keyword": keyword})
 
-# 创建一个测试队列:10000 / BBCSearch
-submit_test.submit(10000, "林郑月娥")
+# # Create a test queue: 10000 / BBCSearch
+# submit_test.submit(10000, {"keyword": keyword})
 
-# 创建一个测试队列:11000 / Wikipedia
-submit_test.submit(11000, "林郑月娥")
+# # Create a test queue: 11000 / Wikipedia
+# submit_test.submit(11000, {"keyword": keyword})
 
-# 创建一个测试队列:12000 / WebbSite
-submit_test.submit(12000, "林郑月娥")
+# # Create a test queue: 12000 / WebbSite
+# submit_test.submit(12000, {"keyword": keyword})
+
+# Create a test queue: 13000 / Crawler_NewsCN Xinhua online search
+submit_test.submit(13000, {"keyword": "china", "lang": "en"})
+submit_test.submit(13000, {"keyword": "中国", "lang": "cn"})
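
Reviewer note: this diff removes the crawler module's `if __name__ == "__main__":` harness, and `process()` no longer quits the driver (the `finally` block is commented out, presumably so the long-lived queue worker can reuse the instance), so a standalone caller must clean up the headless Chrome itself. A minimal smoke-test sketch, assuming chromedriver is on PATH and the repo root is the working directory; the script name is hypothetical, and the `{"keyword": ..., "lang": ...}` payload mirrors submit_test.py:

    # smoke_test_newscn.py -- hypothetical helper, not part of this diff
    from engines.crawler_newscn_search import Crawler_NewsCN

    if __name__ == "__main__":
        crawler = Crawler_NewsCN()  # builds its own headless Chrome driver in __init__
        try:
            # "en" routes through english.news.cn, "cn" through so.news.cn
            results = crawler.process({"keyword": "china", "lang": "en"})
            for blog in results:
                print(blog["title"], "|", blog["meta"])
        finally:
            # process() leaves the driver open, so quit it explicitly here
            crawler.driver.quit()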