From a0256f27e44928c6729aee58fe09d72c1873ed6d Mon Sep 17 00:00:00 2001
From: songtao
Date: Mon, 28 Oct 2024 22:00:25 +0800
Subject: [PATCH] Update google search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 engines/crawler_google_search.py | 80 ++++++++++++++++++++++++++++++++++---
 engines/crawler_wikipedia.py     |  4 ++--
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/engines/crawler_google_search.py b/engines/crawler_google_search.py
index 62acfce..2a01812 100644
--- a/engines/crawler_google_search.py
+++ b/engines/crawler_google_search.py
@@ -1,6 +1,74 @@
-class CrawlerGoogleSearch:
-    def process(inputData):
-        print("CrawlerGoogleSearch / inputData", inputData)
-        # TODO: concrete implementation logic
-
-        return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
+from bs4 import BeautifulSoup as soup
+import googlesearch
+from utils.logger import logger
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+MAX_BLOG_LIMIT = 8
+
+
+class Crawler_GoogleSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        # Headless Chrome, hardened for containerized environments
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
+
+    def __google_search(self, topic):
+        # Return the result URLs for the query as a list
+        return list(googlesearch.search(topic))
+
+    def __get_and_save(self, url):
+        self.driver.get(url)
+        # Wait for the page body to load instead of discarding the wait object
+        WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.TAG_NAME, "body"))
+        )
+        blog_container = self.driver.find_element(By.TAG_NAME, "html")
+        raw_blog = soup(
+            blog_container.get_attribute("innerHTML"), features="html.parser"
+        )
+
+        # Drop style/script tags so only visible text remains
+        for data in raw_blog(['style', 'script']):
+            data.decompose()
+        # Join the text fragments of the parsed page (not the BeautifulSoup class)
+        return ' '.join(raw_blog.stripped_strings)
+
+    def search_and_save(self, keyword):
+        # Cap the number of pages fetched per keyword
+        url_list = self.__google_search(keyword)[:MAX_BLOG_LIMIT]
+        content_list = []
+        for idx, url in enumerate(url_list, start=1):
+            logger.warning(f"Crawler_GoogleSearch {idx}/{len(url_list)} url:{url}")
+            content_list.append(self.__get_and_save(url))
+        return content_list
+
+    def process(self, inputData):
+        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
+        keyword = inputData["keyword"]
+
+        result = []
+        try:
+            result = self.search_and_save(keyword)
+        except Exception:
+            import traceback
+            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")
+
+        logger.warning(
+            f"Crawler_GoogleSearch process completed, keyword={keyword}, result len={len(result)}"
+        )
+        return result
+
+
+if __name__ == "__main__":
+    print(list(googlesearch.search("wang zhi xu")))
diff --git a/engines/crawler_wikipedia.py b/engines/crawler_wikipedia.py
index 5688191..8617f43 100644
--- a/engines/crawler_wikipedia.py
+++ b/engines/crawler_wikipedia.py
@@ -37,7 +37,7 @@ class Crawler_Wikipedia:
             # print("result", result)
         except Exception:
             import traceback
-            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
 
         logger.warning(
-            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
+            f"Crawler_Wikipedia process completed, keyword={keyword}, result len={len(result)}"
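
For reference, a minimal sketch of how the new crawler might be driven,
assuming chromedriver is on PATH and the engines package is importable;
the {"keyword": ...} input shape matches what process() reads in the hunk
above:

    from engines.crawler_google_search import Crawler_GoogleSearch

    crawler = Crawler_GoogleSearch()
    # process() returns a list of plain-text page bodies, one per fetched URL
    pages = crawler.process({"keyword": "wang zhi xu"})
    print(f"fetched {len(pages)} pages")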