forked from iCON/CrawlerEngines

Xinhua English-site online search (新华网英文站在线搜索)

parent e0ae84581e
commit 294cf0eba1

engines/crawler_newscn_search.py

@@ -8,36 +8,71 @@ import json
 import time
 import re
 import os
 from utils.logger import logger
 
-BASE_URL = "https://english.news.cn"
-XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica"]
+# from utils.logger import logger
+
+BASE_URL_EN = "https://english.news.cn"
+BASE_URL_CN = "https://so.news.cn/"
+XINHUA_OVERSEAS_REGIONS = [
+    "asiapacific",
+    "europe",
+    "africa",
+    "northamerica",
+    "german",
+    "20241016",
+]
 
 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 10
 
 
 class Crawler_NewsCN:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.selected_blogs = {"titles": [], "urls": []}
 
-    def __search_topic(self, topic: str):
-        self.driver.get(BASE_URL)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+    def __search_topic(self, topic: str, lang):
+        if lang == "en":
+            self.driver.get(BASE_URL_EN)
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
 
-        # input topic to be searched in search bar
-        search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
-        search_bar.send_keys(topic)
+            # input topic to be searched in search bar
+            search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
+            search_bar.send_keys(topic)
 
-        # click search button
-        search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
-        search_submit_button.click()
+            # click search button
+            search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+            search_submit_button.click()
 
-        # close home window and switch to new window
-        self.driver.close()
-        self.driver.switch_to.window(self.driver.window_handles[0])
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+            # close home window and switch to new window
+            self.driver.close()
+            self.driver.switch_to.window(self.driver.window_handles[0])
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+        else:
+            if lang == "cn":
+                self.driver.get(BASE_URL_CN)
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
+
+                # input topic to be searched in search bar
+                search_bar = self.driver.find_element(By.CLASS_NAME, "input")
+                search_bar.send_keys(topic)
+
+                # click search button
+                # search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+                search_submit_button = self.driver.find_element(
+                    By.CLASS_NAME, "search-button"
+                )
+                search_submit_button.click()
+                self.driver.switch_to.window(self.driver.window_handles[0])
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
 
     def __select_blog_in_search_page(self):
         raw_blogs = self.driver.find_element(By.CLASS_NAME, "content").get_attribute(
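
Note on this hunk: Crawler_NewsCN now constructs its own headless Chrome driver instead of receiving one from the caller, and __search_topic dispatches on lang between the English portal (english.news.cn) and the Chinese search site (so.news.cn). A minimal usage sketch of the reworked class, via the process entry point added in the next hunk (a hypothetical ad-hoc test, not part of this commit):

crawler = Crawler_NewsCN()  # builds its own headless Chrome driver
# lang selects the English ("en") or Chinese ("cn") search flow
blogs = crawler.process({"keyword": "china", "lang": "en"})
for blog in blogs:
    # each item is {"title": ..., "meta": ..., "content": ...}
    print(blog["title"])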
@@ -103,57 +138,91 @@ class Crawler_NewsCN:
         return {"title": blog_title, "meta": blog_meta, "content": blog_content}
 
-    def __get_and_save_blog(self, url: str):
+    # used when lang == "cn"
+    def __retrieve_cn_blog(self) -> dict:
+        blog_title = (
+            self.driver.find_element(By.CLASS_NAME, "head-line.clearfix")
+            .find_element(By.CLASS_NAME, "title")
+            .text
+        )
+
+        blog_meta = (
+            self.driver.find_element(By.CLASS_NAME, "header-cont.clearfix")
+            .find_element(By.CLASS_NAME, "source")
+            .text
+        )
+
+        blog_content = self.driver.find_element(By.ID, "detailContent").text
+
+        return {"title": blog_title, "meta": blog_meta, "content": blog_content}
+
+    def __get_and_save_blog(self, url: str, lang):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
-        region_code = urllib.parse.urlparse(url).path.split("/")[1]
-        if region_code in XINHUA_OVERSEAS_REGIONS:
+        WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
+
+        if lang == "en":
+            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            # if region_code in XINHUA_OVERSEAS_REGIONS:
+            #     blog = self.__retrieve_overseas_blog()
+            # else:
+            #     blog = self.__retrieve_china_blog()
+
             blog = self.__retrieve_overseas_blog()
             blog_title = blog.get("title", "")
-        else:
-            blog = self.__retrieve_china_blog()
-            blog_title = blog.get("title", "")
-        print(blog_title)
+        if lang == "cn":
+            blog = self.__retrieve_cn_blog()
+            blog_title = blog.get("title", "")
+        # print(blog_title)
 
         # Remove invalid char in file_path_name on Windows
-        invalid_chars_pattern = r'[\\/:*?"<>|]'
-        blog_title = re.sub(invalid_chars_pattern, "", blog_title)
+        # invalid_chars_pattern = r'[\\/:*?"<>|]'
+        # blog_title = re.sub(invalid_chars_pattern, "", blog_title)
 
-        file = open(os.path.join("", "Xinhua_{blog_title}.json"), "w")
-        json.dump(blog, file)
-        file.close()
-        time.sleep(2)
+        # file = open(os.path.join("", f"Xinhua_{blog_title}.json"), "w")
+        # json.dump(blog, file)
+        # file.close()
+        # time.sleep(2)
+        return blog
 
-    def search_and_save(self, topic: str):
-        self.__search_topic(topic)
+    def search_and_save(self, topic: str, lang):
+        content_list = []
+        self.__search_topic(topic, lang)
         time.sleep(1)
         self.__select_blog_in_search_page()
-        print(self.selected_blogs)
+        # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
 
+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url, lang)
+            content_list.append(content)
+            idx += 1
+        return content_list
 
-    def direct_save(self, url: str):
-        self.__get_and_save_blog(url)
-
-    def test(self):
+    def process(self, inputData):
+        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
+
+        print("Crawler_NewsCN / inputData", inputData)
+        keyword = inputData["keyword"]
+        lang = inputData["lang"]
+
+        result = []
         try:
-            # self.search_and_save("china")
-            self.search_and_save("xi jinping")
-            # self.direct_save("<an url>")
+            result = self.search_and_save(keyword, lang)
+            # print("result", result)
         except Exception as e:
             import traceback
 
             print(traceback.format_exc())
-        finally:
-            self.driver.quit()
-
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_NewsCN(driver)
-    crawler.test()
+        # finally:
+        #     self.driver.quit()
+        logger.warning(
+            f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
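
Note on the disabled save-to-disk block: the removed line open(os.path.join("", "Xinhua_{blog_title}.json"), "w") was missing the f-string prefix, so every blog would have been written to the literal file Xinhua_{blog_title}.json; the commented-out replacement adds the f but leaves the whole block disabled. If saving is ever re-enabled, a sketch of a corrected helper (hypothetical, reusing the commit's own invalid-character pattern for Windows file names):

import json
import os
import re

def save_blog(blog: dict, out_dir: str = "") -> str:
    # strip characters Windows forbids in file names (same pattern as the commit)
    title = re.sub(r'[\\/:*?"<>|]', "", blog.get("title", "untitled"))
    path = os.path.join(out_dir, f"Xinhua_{title}.json")  # note the f prefix
    with open(path, "w", encoding="utf-8") as file:
        json.dump(blog, file, ensure_ascii=False)
    return path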
engines/info_extractor.py

@@ -0,0 +1,3 @@
+class InfoExtractor:
+    def process(self, inputData):
+        print(inputData)
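
Note: InfoExtractor is only a stub at this point; process prints its input and implicitly returns None, so the 3000 branch in main.py yields nothing yet. The commented hint in Crawler_NewsCN.process suggests the intended return shape; a sketch of a filled-in stub under that assumption (field names taken from that comment, the hard-coded values purely illustrative):

class InfoExtractor:
    def process(self, inputData):
        print(inputData)
        # placeholder extraction result; real logic would parse inputData
        return {"full_name": "xxx", "date_of_birth": "1956-01-01"}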
main.py

@@ -1,4 +1,6 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
+from engines.crawler_newscn_search import Crawler_NewsCN
+from engines.info_extractor import InfoExtractor
 from utils.logger import logger
 import os, sys, time, traceback, json
 
@@ -8,26 +10,29 @@ sys.path.append(relative_path)
 from queue_processor import QueueProcessor
 
+infoExtractor = InfoExtractor()
+crawlerGoogleSearch = CrawlerGoogleSearch()
+crawler_NewsCN = Crawler_NewsCN()
 
 
 class Main(QueueProcessor):
     def processor_handle(self, input):
         print("input:", input)  # {'keyword': '林郑月娥' }
 
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
-        # keyword = inputData["keyword"]
-        # print("keyword:", keyword)
         match currentEngineId:
-            case 3000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 9000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 10000:  # crawler_bbc_search
+            case 3000:  # InfoExtractor (entity information extraction)
+                return infoExtractor.process(inputData)
+            case 9000:  # crawler_google_search (Google online search)
+                return crawlerGoogleSearch.process(inputData)
+            case 10000:  # crawler_bbc_search (BBC online search)
                 print(2)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
+            case 13000:  # crawler_NewsCN (Xinhua English-site online search)
+                return crawler_NewsCN.process(inputData)
 
 
 if __name__ == "__main__":
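
Note: processor_handle expects each queue message to carry currentEngineId as a number and inputData as a JSON-encoded string (hence the json.loads). A message that reaches the new 13000 branch would look roughly like this (illustrative values only):

# illustrative queue message for the Crawler_NewsCN branch (engine id 13000);
# inputData is a JSON string that processor_handle decodes with json.loads
message = {
    "currentEngineId": 13000,
    "inputData": '{"keyword": "china", "lang": "en"}',
}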
@@ -8,20 +8,24 @@ from queue_client import QueueClient
 
 class submit_test:
-    def submit(code, keyword):
+    def submit(code, data):
         client = QueueClient()
-        returnData = client.call(code, {"keyword": keyword})
+        returnData = client.call(code, data)
         print(returnData)
 
 
-# Create a test queue: 9000 / GoogleSearch
-submit_test.submit(9000, "林郑月娥")
+# # Create a test queue: 9000 / GoogleSearch
+# submit_test.submit(9000, {"keyword": keyword})
 
-# Create a test queue: 10000 / BBCSearch
-submit_test.submit(10000, "林郑月娥")
+# # Create a test queue: 10000 / BBCSearch
+# submit_test.submit(10000, {"keyword": keyword})
 
-# Create a test queue: 11000 / Wikipedia
-submit_test.submit(11000, "林郑月娥")
+# # Create a test queue: 11000 / Wikipedia
+# submit_test.submit(11000, {"keyword": keyword})
 
-# Create a test queue: 12000 / WebbSite
-submit_test.submit(12000, "林郑月娥")
+# # Create a test queue: 12000 / WebbSite
+# submit_test.submit(12000, {"keyword": keyword})
 
+# Create a test queue: 13000 / Crawler_NewsCN (Xinhua English-site online search)
+submit_test.submit(13000, {"keyword": "china", "lang": "en"})
+submit_test.submit(13000, {"keyword": "中国", "lang": "cn"})
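
Note: the re-commented calls above reference a bare keyword variable that is never defined in this file, so they would raise NameError if uncommented as-is; the live 13000 calls pass concrete payloads instead. Re-enabling one of the older queue tests would need a literal payload, e.g. (illustrative):

# illustrative: re-enabling an older queue test needs a concrete payload
submit_test.submit(9000, {"keyword": "林郑月娥"})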