"""Google-search crawler that returns the visible text of each result page."""

import traceback
from typing import List

import googlesearch
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from utils.logger import logger

# NOTE(review): not referenced anywhere in this module; presumably intended to
# cap the number of crawled results -- confirm before wiring it in.
MAX_BLOG_LIMIT = 8

# Seconds to wait for a page's root element to appear before giving up.
_PAGE_WAIT_SECONDS = 10

# Command-line switches passed to the Chrome instance.
_CHROME_ARGUMENTS = (
    "--disable-blink-features=AutomationControlled",
    "--ignore-certificate-errors",
    "--ignore-ssl-errors=yes",
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
)


class Crawler_GoogleSearch:
    """Search Google for a keyword and extract the text of every result page.

    A single headless Chrome session is created on construction and reused
    for all page loads. Call :meth:`close` when the crawler is no longer
    needed to shut the browser down.
    """

    def __init__(self) -> None:
        """Start the headless Chrome session used to load result pages."""
        chrome_options = Options()
        for argument in _CHROME_ARGUMENTS:
            chrome_options.add_argument(argument)
        self.driver = webdriver.Chrome(options=chrome_options)

    def close(self) -> None:
        """Quit the Chrome session started in ``__init__``."""
        self.driver.quit()

    def __google_search(self, topic) -> list:
        """Return the results of ``googlesearch.search(topic)`` as a list."""
        return list(googlesearch.search(topic))

    def __get_and_save(self, url) -> str:
        """Load *url* and return its text content as one space-joined string.

        ``<style>`` and ``<script>`` elements are removed before the text is
        extracted.

        Raises:
            selenium.common.exceptions.TimeoutException: if the ``<html>``
                element is not present within ``_PAGE_WAIT_SECONDS``.
        """
        self.driver.get(url)
        # Actually block until the root element exists; constructing a
        # WebDriverWait without calling .until() waits for nothing.
        blog_container = WebDriverWait(self.driver, _PAGE_WAIT_SECONDS).until(
            EC.presence_of_element_located((By.TAG_NAME, "html"))
        )
        raw_blog = soup(
            blog_container.get_attribute("innerHTML"), features="html.parser"
        )
        # Drop non-visible content so only readable text remains.
        for tag in raw_blog(["style", "script"]):
            tag.decompose()
        # Read the strings from the parsed document, not from the
        # BeautifulSoup class alias.
        return " ".join(raw_blog.stripped_strings)

    def search_and_save(self, keyword) -> List[str]:
        """Search for *keyword* and return the extracted text of each result.

        Results are returned in the order the search yielded them. Any
        exception raised while searching or loading a page propagates to the
        caller.
        """
        url_list = self.__google_search(keyword)
        total = len(url_list)
        content_list = []
        for idx, url in enumerate(url_list, start=1):
            logger.warning(f"Crawler_GoogleSearch {idx}/{total} url:{url}")
            content_list.append(self.__get_and_save(url))
        return content_list

    def process(self, inputData) -> List[str]:
        """Run a crawl for ``inputData["keyword"]`` and return the page texts.

        Any exception raised during the crawl is logged with its traceback
        and an empty list is returned instead; a missing ``"keyword"`` key
        still raises ``KeyError`` because the lookup happens before the
        guarded section.
        """
        # Interpolate the payload into the message; passing it as an extra
        # positional argument without a placeholder loses it.
        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
        keyword = inputData["keyword"]
        result = []
        try:
            result = self.search_and_save(keyword)
        except Exception:
            # Top-level boundary: record the failure and fall back to [].
            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")
        logger.warning(
            f"Crawler_GoogleSearch process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result


if __name__ == "__main__":
    print(list(googlesearch.search("wang zhi xu")))