1
0
Fork 0
CrawlerEngines/engines/crawler_google_search.py

70 lines
2.5 KiB
Python

from bs4 import BeautifulSoup as soup
import googlesearch
from utils.logger import logger
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Presumably the maximum number of result pages to crawl per search (going by
# the name).
# NOTE(review): nothing in the visible part of this file references this
# constant, so no limit is currently enforced -- confirm whether
# search_and_save is supposed to apply it.
MAX_BLOG_LIMIT = 8
class Crawler_GoogleSearch:
    """Run a Google search and extract the visible text of each result page.

    A headless Chrome session is started when the instance is created and is
    reused for every page that is fetched. Call :meth:`close` once the crawler
    is no longer needed so the browser process is shut down.
    """

    def __init__(self) -> None:
        """Start the headless Chrome WebDriver used to load result pages."""
        from selenium.webdriver.chrome.options import Options

        chrome_options = Options()
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_argument("--ignore-certificate-errors")
        chrome_options.add_argument("--ignore-ssl-errors=yes")
        # No display is required: run Chrome without a visible window.
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        self.driver = webdriver.Chrome(options=chrome_options)

    def close(self) -> None:
        """Quit the Chrome session created in ``__init__``.

        Calling this more than once is harmless; after the first call the
        ``driver`` attribute is ``None``.
        """
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()
            self.driver = None

    def __google_search(self, topic):
        """Return the URLs produced by ``googlesearch.search(topic)`` as a list."""
        return list(googlesearch.search(topic))

    def __get_and_save(self, url):
        """Load *url* in the browser and return its visible text.

        The page's ``<html>`` element is parsed with BeautifulSoup, all
        ``<style>`` and ``<script>`` tags are removed, and the remaining
        whitespace-stripped text fragments are joined with single spaces.

        Raises whatever the WebDriver raises while loading the page, including
        a timeout if no ``<html>`` element is present within 10 seconds.
        """
        self.driver.get(url)
        # Constructing WebDriverWait alone does not wait for anything; the
        # polling only happens inside .until(), which also hands back the
        # located element.
        blog_container = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "html"))
        )
        raw_blog = soup(
            blog_container.get_attribute("innerHTML"), features="html.parser"
        )
        # Drop non-visible content so only readable text remains.
        for tag in raw_blog(["style", "script"]):
            tag.decompose()
        # Read the text from the parsed document (raw_blog), not from the
        # imported BeautifulSoup class (soup), which has no document content.
        return " ".join(raw_blog.stripped_strings)

    def search_and_save(self, keyword):
        """Search for *keyword* and return the text of every result page.

        Returns a list with one string per result URL, in search-result order.
        Any exception raised while searching or fetching a page propagates to
        the caller.
        """
        url_list = self.__google_search(keyword)
        total = len(url_list)
        content_list = []
        for idx, url in enumerate(url_list, start=1):
            logger.warning(f"Crawler_GoogleSearch {idx}/{total} url:{url}")
            content_list.append(self.__get_and_save(url))
        return content_list

    def process(self, inputData):
        """Crawl the search results for ``inputData["keyword"]``.

        Returns the list produced by :meth:`search_and_save`. If the crawl
        raises, the traceback is logged and an empty list is returned instead.
        A missing ``"keyword"`` key raises ``KeyError`` because the lookup
        happens outside the ``try`` block.
        """
        # Interpolate the payload into the message itself; passing it as a
        # separate positional argument without a placeholder never renders it.
        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
        keyword = inputData["keyword"]
        result = []
        try:
            result = self.search_and_save(keyword)
        except Exception:
            import traceback

            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")
        logger.warning(
            f"Crawler_GoogleSearch process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result
if __name__ == "__main__":
    # Manual smoke test: print the raw URL list returned for a sample query.
    print(list(googlesearch.search("wang zhi xu")))