Branch: dev
million committed 2024-10-18 20:02:57 +08:00
parent 294cf0eba1
commit 36353926de
4 changed files with 135 additions and 91 deletions

engines/crawler_bbc_search.py

@@ -7,17 +7,28 @@ import urllib.parse
 import json
 import time
 import re
+from utils.logger import logger

 BASE_URL = "https://www.bbc.com"
 TARGET_URL = urllib.parse.urljoin(BASE_URL, "/news")

 # The maximum number of blog posts to crawl
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 8

-class Crawler_BBC:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+# BBC online search
+class Crawler_BBCSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
         self.selected_blogs = {"titles": [], "urls": []}

     def __search_topic(self, topic: str):
@@ -26,14 +37,17 @@ class Crawler_BBC:
         # open up side bar to show search bar
         self.driver.find_element(By.CLASS_NAME, "sc-8a068d35-3.kvafkS").click()
+        time.sleep(1)

         # input topic to be searched in search bar
         search_bar = self.driver.find_element(By.CLASS_NAME, "sc-e1a87ea7-1.iARAvt")
         search_bar.send_keys(topic)

         # click search button
-        search_submit_button = self.driver.find_element(
-            By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
-        )
+        # search_submit_button = self.driver.find_element(
+        #     By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
+        # )
+        search_submit_button = self.driver.find_element(
+            By.CSS_SELECTOR, '[data-testid="search-input-search-button"]'
+        )
         search_submit_button.click()
@@ -71,27 +85,29 @@ class Crawler_BBC:
             except Exception:
                 continue

             # skip blogs that are not news
-            if not "news" in url:
-                continue
-            self.selected_blogs["titles"].append(title)
-            # bbc's href links only contains path
-            if not urllib.parse.urlparse(url).netloc:
-                url = urllib.parse.urljoin(BASE_URL, url)
-            self.selected_blogs["urls"].append(url)
+            if "news" in url and "/videos/" not in url:
+                self.selected_blogs["titles"].append(title)
+                # bbc's href links only contains path
+                if not urllib.parse.urlparse(url).netloc:
+                    url = urllib.parse.urljoin(BASE_URL, url)
+                self.selected_blogs["urls"].append(url)

         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
-                By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
-            )
-            try:
-                next_page_btn.click()
-                time.sleep(2)
-                self.__select_blog_in_search_page()
-            except:
-                import traceback
-
-                print(traceback.format_exc())
+            if self.driver.find_elements(
+                By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
+            ):
+                next_page_btn = self.driver.find_element(
+                    By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
+                )
+                try:
+                    next_page_btn.click()
+                    time.sleep(2)
+                    self.__select_blog_in_search_page()
+                except:
+                    import traceback
+
+                    print(traceback.format_exc())
         else:
             self.selected_blogs["titles"] = self.selected_blogs["titles"][
                 :MAX_BLOG_LIMIT
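Both crawlers now guard pagination the same way: call find_elements first, and only then find_element. A minimal sketch of the pattern, assuming an already-open Selenium driver (click_if_present is a hypothetical helper, not part of this commit):

from selenium import webdriver
from selenium.webdriver.common.by import By

def click_if_present(driver: webdriver.Chrome, css_selector: str) -> bool:
    # find_elements returns an empty list instead of raising
    # NoSuchElementException, so it doubles as an existence check
    matches = driver.find_elements(By.CSS_SELECTOR, css_selector)
    if not matches:
        return False
    matches[0].click()
    return True

This avoids wrapping the whole page-turn in a try/except just to handle the last results page, where the next button is absent.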
@@ -108,14 +124,22 @@ class Crawler_BBC:
             "div", attrs={"data-component": "headline-block"}, recursive=True
         ).text
         blog_time = raw_blog.find("time", recursive=True).text
-        blog_contributor = raw_blog.find(
-            "div", attrs={"data-testid": "byline-new-contributors"}, recursive=True
-        ).text
-        blog_meta = {"time": blog_time, "author": blog_contributor}
+        blog_contributor_el = raw_blog.find(
+            "div",
+            attrs={"data-testid": "byline-new-contributors"},
+            recursive=True,
+        )
+        blog_contributor = ""
+        if blog_contributor_el != None:
+            blog_contributor = blog_contributor_el.text

         blog_content_blocks = raw_blog.find_all(
             "div", attrs={"data-component": "text-block"}, recursive=True
         )
+        blog_meta = {"time": blog_time, "author": blog_contributor}

         blog_content = ""
         for block in blog_content_blocks:
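The byline block is optional on BBC articles, so the new code checks the result of find before touching .text. BeautifulSoup's find returns None when nothing matches; a minimal sketch of the same guard (the markup here is hypothetical, not from the commit):

from bs4 import BeautifulSoup

soup = BeautifulSoup("<div><time>2024-10-18</time></div>", "html.parser")
# no element carries this data-testid, so find returns None
byline = soup.find("div", attrs={"data-testid": "byline-new-contributors"})
# without the guard, byline.text would raise AttributeError here
author = byline.text if byline is not None else ""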
@@ -125,9 +149,14 @@ class Crawler_BBC:
     def __get_and_save_blog(self, url: str):
         self.driver.get(url)
-        time.sleep(3)
+        # WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
+        div = WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.ID, "main-content"))
+        )
+        time.sleep(1)

         blog = self.__retrieve_blog()
+        return blog

         blog_title = blog.get("title", "")
         print(blog_title)
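Swapping the fixed time.sleep(3) for WebDriverWait makes the page load wait condition-driven: the wait polls until the element exists or raises TimeoutException after 10 seconds. A sketch of the pattern in isolation (the URL is illustrative):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://www.bbc.com/news")  # illustrative URL
# blocks until the element is present in the DOM, or raises TimeoutException
main = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "main-content"))
)
driver.quit()

Note that the early return blog added here makes the remaining lines of the method (the print and whatever followed) dead code; they are left in place by this commit.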
@@ -142,34 +171,42 @@ class Crawler_BBC:
         time.sleep(2)

     def search_and_save(self, topic: str):
+        content_list = []
+        logger.warning(f"Crawler_BBCSearch start search topic {topic}")
         self.__search_topic(topic)
+        time.sleep(1)
+        logger.warning("Crawler_BBCSearch start select blog in search page")
         self.__select_blog_in_search_page()
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_BBCSearch url_list {str(url_list)}")
+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"Crawler_BBCSearch {idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url)
+            content_list.append(content)
+            idx += 1
+        return content_list

     def direct_save(self, url: str):
         self.__get_and_save_blog(url)

-    def test(self):
+    def process(self, inputData):
+        logger.warning("Crawler_BBCSearch / inputData", inputData)
+        keyword = inputData["keyword"]
+        result = []
         try:
-            self.search_and_save("US election")
-            # self.direct_save("https://www.bbc.com/news/articles/c2edewgv2kpo")
+            result = self.search_and_save(keyword)
+            # print("result", result)
         except Exception as e:
             import traceback
-
-            print(traceback.format_exc())
+
+            logger.warning(f"Crawler_BBCSearch {traceback.format_exc()}")
         finally:
             self.driver.quit()
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_BBC(driver)
-    crawler.test()
+        logger.warning(
+            f"【Crawler_BBCSearch process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
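With test() replaced by process(), the BBC crawler now follows the same process(inputData) contract as the other engines, so main.py can dispatch to it by engine id. A sketch of a direct call, assuming the module layout shown in main.py below:

from engines.crawler_bbc_search import Crawler_BBCSearch

# each call builds its own headless Chrome and quits it in the finally block
articles = Crawler_BBCSearch().process({"keyword": "US election"})
for article in articles:
    # each entry is the dict built by __retrieve_blog; "title" is one known key
    print(article.get("title", ""))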

engines/crawler_newscn_search.py

@@ -14,25 +14,22 @@ from utils.logger import logger
 BASE_URL_EN = "https://english.news.cn"
 BASE_URL_CN = "https://so.news.cn/"
-XINHUA_OVERSEAS_REGIONS = [
-    "asiapacific",
-    "europe",
-    "africa",
-    "northamerica",
-    "german",
-    "20241016",
-]
+XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica", "german"]

 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 10
+MAX_BLOG_LIMIT = 8

+# Xinhuanet English-site online search
 class Crawler_NewsCN:
     def __init__(self) -> None:
         from selenium.webdriver.chrome.options import Options

         chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
@@ -98,18 +95,19 @@ class Crawler_NewsCN:
         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
-                By.CLASS_NAME, "ant-pagination-next"
-            )
-            if next_page_btn.get_attribute("aria-disabled") != "true":
-                try:
-                    next_page_btn.click()
-                    time.sleep(2)
-                    self.__select_blog_in_search_page()
-                except:
-                    import traceback
-
-                    print(traceback.format_exc())
+            if self.driver.find_elements(By.CLASS_NAME, "ant-pagination-next"):
+                next_page_btn = self.driver.find_element(
+                    By.CLASS_NAME, "ant-pagination-next"
+                )
+                if next_page_btn.get_attribute("aria-disabled") != "true":
+                    try:
+                        next_page_btn.click()
+                        time.sleep(2)
+                        self.__select_blog_in_search_page()
+                    except:
+                        import traceback
+
+                        print(traceback.format_exc())
         else:
             self.selected_blogs["titles"] = self.selected_blogs["titles"][
                 :MAX_BLOG_LIMIT
@@ -158,19 +156,26 @@ class Crawler_NewsCN:
     def __get_and_save_blog(self, url: str, lang):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))

         if lang == "en":
-            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
-            # if region_code in XINHUA_OVERSEAS_REGIONS:
-            #     blog = self.__retrieve_overseas_blog()
-            # else:
-            #     blog = self.__retrieve_china_blog()
-            blog = self.__retrieve_overseas_blog()
-            blog_title = blog.get("title", "")
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+            # time.sleep(1)
+            region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            if region_code in XINHUA_OVERSEAS_REGIONS:
+                blog = self.__retrieve_overseas_blog()
+            else:
+                blog = self.__retrieve_china_blog()
+            # div = WebDriverWait(self.driver, 10).until(
+            #     EC.presence_of_element_located((By.CLASS_NAME, "detailContent"))
+            # )
+            # blog = self.__retrieve_overseas_blog()
+            # blog_title = blog.get("title", "")
         else:
             if lang == "cn":
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
                 blog = self.__retrieve_cn_blog()
                 blog_title = blog.get("title", "")
                 # print(blog_title)
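The restored region routing takes the first path segment of the article URL and checks it against XINHUA_OVERSEAS_REGIONS to pick the right parser. Worked through on an illustrative URL (the path shown is hypothetical):

import urllib.parse

url = "https://english.news.cn/asiapacific/20241018/example.htm"  # hypothetical path
# .path is "/asiapacific/20241018/example.htm"; split("/") yields
# ["", "asiapacific", "20241018", "example.htm"], so index 1 is the region
region_code = urllib.parse.urlparse(url).path.split("/")[1]
print(region_code)  # -> "asiapacific"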
@@ -187,15 +192,18 @@ class Crawler_NewsCN:
     def search_and_save(self, topic: str, lang):
         content_list = []
+        logger.warning(f"Crawler_NewsCN start search topic {topic} {lang}")
         self.__search_topic(topic, lang)
         time.sleep(1)
+        logger.warning("Crawler_NewsCN start select blog in search page")
         self.__select_blog_in_search_page()
         # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_NewsCN url_list {str(url_list)}")
         idx = 1
         for url in url_list:
-            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            logger.warning(f"Crawler_NewsCN {idx}/{len(url_list)} url:{url}")
             content = self.__get_and_save_blog(url, lang)
             content_list.append(content)
             idx += 1
@@ -205,9 +213,7 @@ class Crawler_NewsCN:
         self.__get_and_save_blog(url)

     def process(self, inputData):
-        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
-        print("Crawler_NewsCN / inputData", inputData)
+        logger.warning("Crawler_NewsCN / inputData", inputData)
         keyword = inputData["keyword"]
         lang = inputData["lang"]
@@ -218,9 +224,9 @@ class Crawler_NewsCN:
         except Exception as e:
             import traceback

-            print(traceback.format_exc())
-        # finally:
-        #     self.driver.quit()
+            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+        finally:
+            self.driver.quit()
         logger.warning(
             f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"

main.py

@@ -1,6 +1,7 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
 from engines.crawler_newscn_search import Crawler_NewsCN
 from engines.info_extractor import InfoExtractor
+from engines.crawler_bbc_search import Crawler_BBCSearch
 from utils.logger import logger
 import os, sys, time, traceback, json
@@ -10,29 +11,26 @@ sys.path.append(relative_path)
 from queue_processor import QueueProcessor

-infoExtractor = InfoExtractor()
-crawlerGoogleSearch = CrawlerGoogleSearch()
-crawler_NewsCN = Crawler_NewsCN()

 class Main(QueueProcessor):
     def processor_handle(self, input):
-        print("input:", input)  # {'keyword': '林郑月娥' }
+        print("input:", input)
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
         match currentEngineId:
             case 3000:  # InfoExtractor entity information extraction
-                return infoExtractor.process(inputData)
+                return InfoExtractor().process(inputData)
             case 9000:  # crawler_google_search Google online
-                return crawlerGoogleSearch.process(inputData)
+                return CrawlerGoogleSearch().process(inputData)
             case 10000:  # crawler_bbc_search BBC online
-                print(2)
+                return Crawler_BBCSearch().process(inputData)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
             case 13000:  # crawler_NewsCN Xinhuanet English-site online search
-                return crawler_NewsCN.process(inputData)
+                # crawler_NewsCN = Crawler_NewsCN()
+                return Crawler_NewsCN().process(inputData)

 if __name__ == "__main__":
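Dropping the module-level singletons matters because each crawler's process() now calls self.driver.quit() in its finally block: a shared instance would hold a dead WebDriver after the first request. Constructing per dispatch, as the new match arms do, gives every request a fresh browser. A minimal illustration of the failure mode avoided (standalone snippet, keyword values are illustrative; assumes the engines package is on the path):

from engines.crawler_newscn_search import Crawler_NewsCN

# old style: one shared instance for the process lifetime
crawler = Crawler_NewsCN()
crawler.process({"keyword": "US election", "lang": "en"})  # quits the driver in finally
# a second call would now fail: the underlying Chrome session is gone
# crawler.process({"keyword": "US election", "lang": "en"})

# new style: a fresh instance, and a fresh driver, per request
Crawler_NewsCN().process({"keyword": "US election", "lang": "en"})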

submit_test script

@@ -17,8 +17,11 @@ class submit_test:
 # # Create a test queue 9000 / GoogleSearch
 # submit_test.submit(9000, {"keyword": keyword})

-# # Create a test queue 10000 / BBCSearch
-# submit_test.submit(10000, {"keyword": keyword})
-submit_test.submit(10000, {"keyword": "US election"})
+# # # Create a test queue 10000 / BBCSearch
+submit_test.submit(10000, {"keyword": "习近平"})
+# submit_test.submit(10000, {"keyword": "US election"})
+# submit_test.submit(10000, {"keyword": "US election"})

 # # Create a test queue 11000 / Wikipedia
 # submit_test.submit(11000, {"keyword": keyword})