# CrawlerEngines/engines/crawler_google_search.py

from bs4 import BeautifulSoup as soup
import googlesearch
from utils.logger import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# NOTE(review): not referenced anywhere in the visible code. Presumably meant
# as a cap on how many result pages are crawled per search -- TODO confirm
# the intent and either apply it or remove it.
MAX_BLOG_LIMIT = 8

class Crawler_GoogleSearch:
    """Crawl the pages returned by a Google search and extract their text.

    A keyword is turned into a list of URLs via ``googlesearch.search``; each
    URL is then loaded in a Selenium-driven headless Chrome session and
    reduced to its text content with BeautifulSoup.

    The Chrome session is created in ``__init__`` and reused for every fetch.
    Call :meth:`close` when the crawler is no longer needed so the browser
    process is shut down.
    """

    def __init__(self) -> None:
        """Start the headless Chrome session used for all page fetches."""
        from selenium.webdriver.chrome.options import Options

        chrome_options = Options()
        # Command-line switches handed to Chrome, applied in this order.
        for switch in (
            "--disable-blink-features=AutomationControlled",
            "--ignore-certificate-errors",
            "--ignore-ssl-errors=yes",
            "--headless",
            "--no-sandbox",
            "--disable-dev-shm-usage",
        ):
            chrome_options.add_argument(switch)
        self.driver = webdriver.Chrome(options=chrome_options)

    def close(self) -> None:
        """Quit the Chrome session; calling this more than once is harmless.

        After closing, the instance can no longer fetch pages.
        """
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        # Drop the reference first so a repeated call cannot quit twice.
        self.driver = None
        driver.quit()

    def __google_search(self, topic):
        """Return the URLs yielded by ``googlesearch.search(topic)`` as a list."""
        return list(googlesearch.search(topic))

    def __get_and_save(self, url):
        """Load *url* and return its text as a single space-joined string.

        ``<style>`` and ``<script>`` elements are stripped before the text is
        collected. Despite the name, nothing is persisted here; the text is
        only returned to the caller.

        Raises:
            Whatever Selenium raises when the page cannot be loaded, including
            a timeout if the root element is not present within 10 seconds.
        """
        self.driver.get(url)
        # A WebDriverWait only blocks once ``until`` is called, so wait
        # explicitly for the document root and use the element it returns.
        blog_container = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "html"))
        )
        raw_blog = soup(
            blog_container.get_attribute("innerHTML"), features="html.parser"
        )

        # Remove non-content tags so their source text is not extracted.
        for tag in raw_blog(["style", "script"]):
            tag.decompose()

        # Read the strings from the parsed document; ``soup`` itself is only
        # the BeautifulSoup class alias and holds no page content.
        return " ".join(raw_blog.stripped_strings)

    def search_and_save(self, keyword):
        """Search for *keyword* and return the text of every result page.

        Returns:
            A list with one text string per result URL, in search order.

        Raises:
            Any exception from the search or from fetching a page; a single
            failing page aborts the whole crawl.
        """
        url_list = self.__google_search(keyword)
        total = len(url_list)
        content_list = []
        for idx, url in enumerate(url_list, start=1):
            logger.warning(f"Crawler_GoogleSearch {idx}/{total} url:{url}")
            content_list.append(self.__get_and_save(url))
        return content_list

    def process(self, inputData):
        """Run a crawl for ``inputData["keyword"]`` and return the page texts.

        Errors raised during the crawl are logged with their traceback and
        result in an empty list instead of propagating.

        Args:
            inputData: Mapping that must contain a ``"keyword"`` entry.

        Returns:
            The list produced by :meth:`search_and_save`, or ``[]`` on failure.

        Raises:
            KeyError: If ``inputData`` has no ``"keyword"`` entry.
        """
        # Interpolate the payload into the message; passed as a bare extra
        # argument it has no placeholder to land in and is never rendered.
        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
        keyword = inputData["keyword"]

        result = []
        try:
            result = self.search_and_save(keyword)
        except Exception:
            import traceback

            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")

        logger.warning(
            f"Crawler_GoogleSearch process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result

if __name__ == "__main__":
    # Manual smoke test: print the result URLs for a sample query.
    sample_urls = list(googlesearch.search("wang zhi xu"))
    print(sample_urls)