From a0256f27e44928c6729aee58fe09d72c1873ed6d Mon Sep 17 00:00:00 2001
From: songtao
Date: Mon, 28 Oct 2024 22:00:25 +0800
Subject: [PATCH] Update google search
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 engines/crawler_google_search.py | 80 ++++++++++++++++++++++++++++++++++---
 engines/crawler_wikipedia.py     |  4 ++--
 2 files changed, 76 insertions(+), 8 deletions(-)

diff --git a/engines/crawler_google_search.py b/engines/crawler_google_search.py
index 62acfce..2a01812 100644
--- a/engines/crawler_google_search.py
+++ b/engines/crawler_google_search.py
@@ -1,6 +1,74 @@
-class CrawlerGoogleSearch:
-    def process(inputData):
-        print("CrawlerGoogleSearch / inputData", inputData)
-        # TODO: concrete implementation logic
-
-        return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
+from bs4 import BeautifulSoup as soup
+import googlesearch
+from utils.logger import logger
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+MAX_BLOG_LIMIT = 8
+
+
+class Crawler_GoogleSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        # Headless Chrome, hardened for containerized environments
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
+
+    def __google_search(self, topic):
+        # Return the result URLs for the query as a list
+        return list(googlesearch.search(topic))
+
+    def __get_and_save(self, url):
+        self.driver.get(url)
+        # Wait for the page body to load instead of discarding the wait object
+        WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.TAG_NAME, "body"))
+        )
+        blog_container = self.driver.find_element(By.TAG_NAME, "html")
+        raw_blog = soup(
+            blog_container.get_attribute("innerHTML"), features="html.parser"
+        )
+
+        # Drop style/script tags so only visible text remains
+        for data in raw_blog(['style', 'script']):
+            data.decompose()
+        # Join the text fragments of the parsed page (not the BeautifulSoup class)
+        return ' '.join(raw_blog.stripped_strings)
+
+    def search_and_save(self, keyword):
+        # Cap the number of pages fetched per keyword
+        url_list = self.__google_search(keyword)[:MAX_BLOG_LIMIT]
+        content_list = []
+        for idx, url in enumerate(url_list, start=1):
+            logger.warning(f"Crawler_GoogleSearch {idx}/{len(url_list)} url:{url}")
+            content_list.append(self.__get_and_save(url))
+        return content_list
+
+    def process(self, inputData):
+        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
+        keyword = inputData["keyword"]
+
+        result = []
+        try:
+            result = self.search_and_save(keyword)
+        except Exception:
+            import traceback
+            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")
+
+        logger.warning(
+            f"Crawler_GoogleSearch process completed, keyword={keyword}, result len={len(result)}"
+        )
+        return result
+
+
+if __name__ == "__main__":
+    print(list(googlesearch.search("wang zhi xu")))
diff --git a/engines/crawler_wikipedia.py b/engines/crawler_wikipedia.py
index 5688191..8617f43 100644
--- a/engines/crawler_wikipedia.py
+++ b/engines/crawler_wikipedia.py
@@ -37,7 +37,7 @@ class Crawler_Wikipedia:
             # print("result", result)
         except Exception:
             import traceback
-            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
 
         logger.warning(
-            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
+            f"Crawler_Wikipedia process completed, keyword={keyword}, result len={len(result)}"
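
For reference, a minimal sketch of how the new crawler might be driven,
assuming chromedriver is on PATH and the engines package is importable;
the {"keyword": ...} input shape matches what process() reads in the hunk
above:

    from engines.crawler_google_search import Crawler_GoogleSearch

    crawler = Crawler_GoogleSearch()
    # process() returns a list of plain-text page bodies, one per fetched URL
    pages = crawler.process({"keyword": "wang zhi xu"})
    print(f"fetched {len(pages)} pages")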