forked from iCON/CrawlerEngines
Update google search
parent 17492cbb5e
commit a0256f27e4
@@ -1,6 +1,70 @@
-class CrawlerGoogleSearch:
-    def process(inputData):
-        print("CrawlerGoogleSearch / inputData", inputData)
-        # TODO: implement the actual crawling logic
-        return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
-
+from bs4 import BeautifulSoup as soup
+import googlesearch
+from utils.logger import logger
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.wait import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+MAX_BLOG_LIMIT = 8
+
+
+class Crawler_GoogleSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
+
+    def __google_search(self, topic):
+        # Return a list of result URLs, capped at MAX_BLOG_LIMIT.
+        return list(googlesearch.search(topic))[:MAX_BLOG_LIMIT]
+
+    def __get_and_save(self, url):
+        self.driver.get(url)
+        # Crude page-load wait: block (up to 10 s) until <body> is present.
+        WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.TAG_NAME, "body"))
+        )
+        blog_container = self.driver.find_element(By.TAG_NAME, "html")
+        raw_blog = soup(
+            blog_container.get_attribute("innerHTML"), features="html.parser"
+        )
+        # Strip <style> and <script> tags before extracting text.
+        for data in raw_blog(["style", "script"]):
+            data.decompose()
+        # Return the visible text content of the page.
+        return " ".join(raw_blog.stripped_strings)
+
+    def search_and_save(self, keyword):
+        url_list = self.__google_search(keyword)
+        content_list = []
+        for idx, url in enumerate(url_list, start=1):
+            logger.warning(f"Crawler_GoogleSearch {idx}/{len(url_list)} url:{url}")
+            content_list.append(self.__get_and_save(url))
+        return content_list
+
+    def process(self, inputData):
+        logger.warning(f"Crawler_GoogleSearch / inputData {inputData}")
+        keyword = inputData["keyword"]
+
+        result = []
+        try:
+            result = self.search_and_save(keyword)
+        except Exception:
+            import traceback
+            logger.warning(f"Crawler_GoogleSearch {traceback.format_exc()}")
+
+        logger.warning(
+            f"Crawler_GoogleSearch process completed, keyword={keyword}, result len={len(result)}"
+        )
+        return result
+
+
+if __name__ == "__main__":
+    print([i for i in googlesearch.search("wang zhi xu")])
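Editor's note: a minimal driver for the new class, sketched under the assumption that chromedriver is on PATH and utils.logger is importable; it is not part of the commit. The class never closes its WebDriver, so the caller should do so:

    # Hypothetical usage sketch; "keyword" mirrors the inputData key read by process().
    crawler = Crawler_GoogleSearch()
    try:
        pages = crawler.process({"keyword": "wang zhi xu"})
        print(f"fetched {len(pages)} pages")
    finally:
        crawler.driver.quit()  # release the headless Chrome instance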
@@ -37,7 +37,7 @@ class Crawler_Wikipedia:
             # print("result", result)
         except Exception:
             import traceback
-            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
 
         logger.warning(
             f"Crawler_Wikipedia process completed, keyword={keyword}, result len={len(result)}"
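One dependency caveat, stated as an assumption: if googlesearch here is the generator-based PyPI package, search() pages through results lazily, so building the full list before slicing still fetches every result page. Bounding the iterator directly avoids that:

    # Hypothetical refinement, assuming search() yields URLs lazily.
    import itertools
    import googlesearch

    MAX_BLOG_LIMIT = 8
    # islice stops the underlying paging once enough URLs have been yielded.
    urls = list(itertools.islice(googlesearch.search("wang zhi xu"), MAX_BLOG_LIMIT))
    print(urls)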