forked from iCON/CrawlerEngines

commit 36353926de (parent 294cf0eba1): bbc news

engines/crawler_bbc_search.py
@@ -7,17 +7,28 @@ import urllib.parse
 import json
 import time
 import re
+from utils.logger import logger

 BASE_URL = "https://www.bbc.com"
 TARGET_URL = urllib.parse.urljoin(BASE_URL, "/news")

 # The maximum number of blog posts to crawl
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 8


-class Crawler_BBC:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+# BBC online search
+class Crawler_BBCSearch:
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
         self.selected_blogs = {"titles": [], "urls": []}

     def __search_topic(self, topic: str):
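The constructor now builds its own headless Chrome driver instead of receiving one from the caller. As an illustration only (not part of the commit), a self-contained version of that setup, assuming Selenium 4+ and a local Chrome install so Selenium Manager can resolve chromedriver, could look like this:

# Minimal sketch of the headless-Chrome setup introduced above.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options


def make_headless_driver() -> webdriver.Chrome:
    opts = Options()
    opts.add_argument("--disable-blink-features=AutomationControlled")
    opts.add_argument("--ignore-certificate-errors")
    opts.add_argument("--ignore-ssl-errors=yes")
    opts.add_argument("--headless")
    opts.add_argument("--no-sandbox")
    opts.add_argument("--disable-dev-shm-usage")
    return webdriver.Chrome(options=opts)


if __name__ == "__main__":
    driver = make_headless_driver()
    try:
        driver.get("https://www.bbc.com/news")
        print(driver.title)
    finally:
        driver.quit()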
@@ -26,14 +37,17 @@ class Crawler_BBC:

         # open up side bar to show search bar
         self.driver.find_element(By.CLASS_NAME, "sc-8a068d35-3.kvafkS").click()
+        time.sleep(1)
         # input topic to be searched in search bar
         search_bar = self.driver.find_element(By.CLASS_NAME, "sc-e1a87ea7-1.iARAvt")
         search_bar.send_keys(topic)

         # click search button
+        # search_submit_button = self.driver.find_element(
+        #     By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
+        # )
         search_submit_button = self.driver.find_element(
-            By.CLASS_NAME, "sc-f6c53a81-2.sc-f6c53a81-3.dyeOnJ.dQfGZm"
+            By.CSS_SELECTOR, '[data-testid="search-input-search-button"]'
         )
         search_submit_button.click()
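The selector change swaps a hashed CSS-in-JS class name for a data-testid attribute, which tends to survive BBC front-end rebuilds. A hedged sketch of clicking that button with an explicit wait rather than a sleep (only the selector comes from the diff; the wait is an illustration):

# Illustrative only: wait for the search button the new selector targets,
# then click it. element_to_be_clickable avoids a fixed time.sleep().
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def click_search_button(driver, timeout: int = 10) -> None:
    button = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[data-testid="search-input-search-button"]')
        )
    )
    button.click()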
@@ -71,8 +85,7 @@ class Crawler_BBC:
             except Exception:
                 continue
             # skip blogs that are not news
-            if not "news" in url:
-                continue
-            self.selected_blogs["titles"].append(title)
-            # bbc's href links only contains path
-            if not urllib.parse.urlparse(url).netloc:
+            if "news" in url and "/videos/" not in url:
+                self.selected_blogs["titles"].append(title)
+                # bbc's href links only contains path
+                if not urllib.parse.urlparse(url).netloc:
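The new condition keeps /news/ article links and drops video pages, and the netloc check that follows absolutizes BBC's path-only hrefs. A small standalone restatement of that filtering (the helpers and sample URLs are illustrative):

# Hypothetical helpers mirroring the filtering logic above.
import urllib.parse

BASE_URL = "https://www.bbc.com"


def keep_article(url: str) -> bool:
    # Keep news links, skip video pages.
    return "news" in url and "/videos/" not in url


def absolutize(url: str) -> str:
    # BBC search results often carry path-only hrefs such as "/news/articles/xyz".
    if not urllib.parse.urlparse(url).netloc:
        return urllib.parse.urljoin(BASE_URL, url)
    return url


print(keep_article("/news/articles/c2edewgv2kpo"))  # True
print(keep_article("/news/videos/abc"))             # False
print(absolutize("/news/articles/c2edewgv2kpo"))    # https://www.bbc.com/news/articles/c2edewgv2kpo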
@@ -81,6 +94,9 @@ class Crawler_BBC:

         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
-                By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
-            )
+            if self.driver.find_elements(
+                By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
+            ):
+                next_page_btn = self.driver.find_element(
+                    By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
+                )
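Guarding pagination with find_elements matters because find_element raises NoSuchElementException on the last results page, while find_elements simply returns an empty list. A sketch of that pattern:

# Illustrative existence check before clicking "next page".
from selenium.webdriver.common.by import By


def click_next_page_if_present(driver) -> bool:
    matches = driver.find_elements(
        By.CSS_SELECTOR, 'button[data-testid="pagination-next-button"]'
    )
    if not matches:
        return False  # last page of results, nothing to click
    matches[0].click()
    return True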
@@ -108,14 +124,22 @@ class Crawler_BBC:
             "div", attrs={"data-component": "headline-block"}, recursive=True
         ).text
         blog_time = raw_blog.find("time", recursive=True).text
-        blog_contributor = raw_blog.find(
-            "div", attrs={"data-testid": "byline-new-contributors"}, recursive=True
-        ).text
-        blog_meta = {"time": blog_time, "author": blog_contributor}
+        blog_contributor_el = raw_blog.find(
+            "div",
+            attrs={"data-testid": "byline-new-contributors"},
+            recursive=True,
+        )
+        blog_contributor = ""
+        if blog_contributor_el != None:
+            blog_contributor = blog_contributor_el.text

         blog_content_blocks = raw_blog.find_all(
             "div", attrs={"data-component": "text-block"}, recursive=True
         )

+        blog_meta = {"time": blog_time, "author": blog_contributor}
+
         blog_content = ""

         for block in blog_content_blocks:
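The byline block is optional on BBC articles, so the parse now checks for None before reading .text. A runnable BeautifulSoup check with made-up HTML but the same attrs as the crawler uses:

# Sketch of the None-safe byline extraction (sample HTML is invented).
from bs4 import BeautifulSoup

html = """
<article>
  <div data-component="headline-block">Example headline</div>
  <time>2 hours ago</time>
  <div data-component="text-block">First paragraph.</div>
</article>
"""

soup = BeautifulSoup(html, "html.parser")
byline = soup.find("div", attrs={"data-testid": "byline-new-contributors"})
author = byline.text if byline is not None else ""  # no byline in this sample
print(repr(author))  # ''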
@@ -125,9 +149,14 @@ class Crawler_BBC:

     def __get_and_save_blog(self, url: str):
         self.driver.get(url)
-        time.sleep(3)
+        # WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
+        div = WebDriverWait(self.driver, 10).until(
+            EC.presence_of_element_located((By.ID, "main-content"))
+        )
+        time.sleep(1)

         blog = self.__retrieve_blog()
+        return blog

         blog_title = blog.get("title", "")
         print(blog_title)
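Replacing the fixed three-second sleep with an explicit wait on id="main-content" blocks only as long as the article actually needs to render. A sketch of the wait-then-parse handoff; the BeautifulSoup step is a guess at what __retrieve_blog does, only the wait itself comes from the diff:

# Illustrative wait-then-parse helper.
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def fetch_article_soup(driver, url: str, timeout: int = 10) -> BeautifulSoup:
    driver.get(url)
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, "main-content"))
    )
    return BeautifulSoup(driver.page_source, "html.parser")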
@@ -142,34 +171,42 @@ class Crawler_BBC:
             time.sleep(2)

     def search_and_save(self, topic: str):
+        content_list = []
+        logger.warning(f"Crawler_BBCSearch start search topic {topic}")
         self.__search_topic(topic)
+        time.sleep(1)
+        logger.warning("Crawler_BBCSearch start select blog in search page")
         self.__select_blog_in_search_page()
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_BBCSearch url_list {str(url_list)}")

+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"Crawler_BBCSearch {idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url)
+            content_list.append(content)
+            idx += 1
+        return content_list

     def direct_save(self, url: str):
         self.__get_and_save_blog(url)

-    def test(self):
+    def process(self, inputData):
+        logger.warning("Crawler_BBCSearch / inputData", inputData)
+        keyword = inputData["keyword"]
+
+        result = []
         try:
-            self.search_and_save("US election")
-            # self.direct_save("https://www.bbc.com/news/articles/c2edewgv2kpo")
+            result = self.search_and_save(keyword)
+            # print("result", result)
         except Exception as e:
             import traceback

-            print(traceback.format_exc())
+            logger.warning(f"Crawler_BBCSearch {traceback.format_exc()}")
         finally:
             self.driver.quit()

+        logger.warning(
+            f"【Crawler_BBCSearch process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
-
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_BBC(driver)
-    crawler.test()
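With the new process() entry point, the BBC crawler follows the same contract as the other engines: it takes an inputData dict and returns a list of crawled articles. A hypothetical direct invocation (the keyword is an example; Chrome and network access are assumed):

# Hypothetical standalone usage of the new entry point.
from engines.crawler_bbc_search import Crawler_BBCSearch

result = Crawler_BBCSearch().process({"keyword": "US election"})
print(f"crawled {len(result)} articles")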
engines/crawler_newscn_search.py

@@ -14,25 +14,22 @@ from utils.logger import logger

 BASE_URL_EN = "https://english.news.cn"
 BASE_URL_CN = "https://so.news.cn/"
-XINHUA_OVERSEAS_REGIONS = [
-    "asiapacific",
-    "europe",
-    "africa",
-    "northamerica",
-    "german",
-    "20241016",
-]
+XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica", "german"]

 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 10
+MAX_BLOG_LIMIT = 8


+# Xinhua English-site online search
 class Crawler_NewsCN:
     def __init__(self) -> None:
         from selenium.webdriver.chrome.options import Options

         chrome_options = Options()
+        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
+        chrome_options.add_argument("--ignore-certificate-errors")
+        chrome_options.add_argument("--ignore-ssl-errors=yes")
         chrome_options.add_argument("--headless")
         chrome_options.add_argument("--no-sandbox")
         chrome_options.add_argument("--disable-dev-shm-usage")
@@ -98,6 +95,7 @@ class Crawler_NewsCN:

         if len(self.selected_blogs["urls"]) < MAX_BLOG_LIMIT:
             # go to next page
-            next_page_btn = self.driver.find_element(
-                By.CLASS_NAME, "ant-pagination-next"
-            )
+            if self.driver.find_elements(By.CLASS_NAME, "ant-pagination-next"):
+                next_page_btn = self.driver.find_element(
+                    By.CLASS_NAME, "ant-pagination-next"
+                )
@@ -158,19 +156,26 @@ class Crawler_NewsCN:

     def __get_and_save_blog(self, url: str, lang):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))

         if lang == "en":
-            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
-            # if region_code in XINHUA_OVERSEAS_REGIONS:
-            #     blog = self.__retrieve_overseas_blog()
-            # else:
-            #     blog = self.__retrieve_china_blog()
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+            # time.sleep(1)

-            blog = self.__retrieve_overseas_blog()
-            blog_title = blog.get("title", "")
+            region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            if region_code in XINHUA_OVERSEAS_REGIONS:
+                blog = self.__retrieve_overseas_blog()
+            else:
+                blog = self.__retrieve_china_blog()
+
+            # div = WebDriverWait(self.driver, 10).until(
+            #     EC.presence_of_element_located((By.CLASS_NAME, "detailContent"))
+            # )
+            # blog = self.__retrieve_overseas_blog()
+
+            # blog_title = blog.get("title", "")
         else:
             if lang == "cn":
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
                 blog = self.__retrieve_cn_blog()
                 blog_title = blog.get("title", "")
                 # print(blog_title)
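The English branch now routes on the first path segment of the article URL: overseas-desk pages get one parser, China-desk pages another. A quick standalone check of that routing (the sample URLs are illustrative):

# Sketch of the region routing added above.
import urllib.parse

XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica", "german"]


def is_overseas(url: str) -> bool:
    region_code = urllib.parse.urlparse(url).path.split("/")[1]
    return region_code in XINHUA_OVERSEAS_REGIONS


print(is_overseas("https://english.news.cn/europe/20250101/example/c.html"))  # True
print(is_overseas("https://english.news.cn/20250101/example/c.html"))         # False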
@@ -187,15 +192,18 @@ class Crawler_NewsCN:

     def search_and_save(self, topic: str, lang):
         content_list = []
+        logger.warning(f"Crawler_NewsCN start search topic {topic} {lang}")
         self.__search_topic(topic, lang)
         time.sleep(1)
+        logger.warning("Crawler_NewsCN start select blog in search page")
         self.__select_blog_in_search_page()
         # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
+        logger.warning(f"Crawler_NewsCN url_list {str(url_list)}")

         idx = 1
         for url in url_list:
-            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            logger.warning(f"Crawler_NewsCN {idx}/{len(url_list)} url:{url}")
             content = self.__get_and_save_blog(url, lang)
             content_list.append(content)
             idx += 1
@@ -205,9 +213,7 @@ class Crawler_NewsCN:
         self.__get_and_save_blog(url)

     def process(self, inputData):
-        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
-
-        print("Crawler_NewsCN / inputData", inputData)
+        logger.warning("Crawler_NewsCN / inputData", inputData)
         keyword = inputData["keyword"]
         lang = inputData["lang"]
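The two-argument logger.warning("Crawler_NewsCN / inputData", inputData) call relies on utils.logger accepting print-style arguments; with the standard library logger, extra positionals are treated as %-format arguments instead. A sketch of the stdlib-safe forms (stdlib behaviour only, utils.logger may differ):

# Shows standard logging behaviour; not a claim about utils.logger.
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("crawler")

inputData = {"keyword": "US election", "lang": "en"}
logger.warning("Crawler_NewsCN / inputData %s", inputData)   # lazy %-formatting
logger.warning(f"Crawler_NewsCN / inputData {inputData}")    # eager f-string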
@@ -218,9 +224,9 @@ class Crawler_NewsCN:
         except Exception as e:
             import traceback

-            print(traceback.format_exc())
-        # finally:
-        #     self.driver.quit()
+            logger.warning(f"Crawler_NewsCN {traceback.format_exc()}")
+        finally:
+            self.driver.quit()

         logger.warning(
             f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"
main.py (20 changed lines)
@@ -1,6 +1,7 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
 from engines.crawler_newscn_search import Crawler_NewsCN
 from engines.info_extractor import InfoExtractor
+from engines.crawler_bbc_search import Crawler_BBCSearch
 from utils.logger import logger
 import os, sys, time, traceback, json
@@ -10,29 +11,26 @@ sys.path.append(relative_path)

 from queue_processor import QueueProcessor

-infoExtractor = InfoExtractor()
-crawlerGoogleSearch = CrawlerGoogleSearch()
-crawler_NewsCN = Crawler_NewsCN()


 class Main(QueueProcessor):
     def processor_handle(self, input):
-        print("input:", input)  # {'keyword': '林郑月娥' }
+        print("input:", input)
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
         match currentEngineId:
             case 3000:  # InfoExtractor: entity info extraction
-                return infoExtractor.process(inputData)
+                return InfoExtractor().process(inputData)
             case 9000:  # crawler_google_search: Google online search
-                return crawlerGoogleSearch.process(inputData)
+                return CrawlerGoogleSearch().process(inputData)
             case 10000:  # crawler_bbc_search: BBC online search
-                print(2)
+                return Crawler_BBCSearch().process(inputData)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
             case 13000:  # crawler_NewsCN: Xinhua English-site online search
-                return crawler_NewsCN.process(inputData)
+                # crawler_NewsCN = Crawler_NewsCN()
+                return Crawler_NewsCN().process(inputData)


 if __name__ == "__main__":
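The dispatcher no longer reuses module-level engine instances; each queue message now constructs its engine, presumably so every job gets a fresh Selenium session after the previous one calls driver.quit(). A condensed sketch of the per-request dispatch (the fallback error handling is an assumption, not part of the commit):

# Illustrative per-request dispatch, mirroring the match statement above.
from engines.crawler_bbc_search import Crawler_BBCSearch
from engines.crawler_newscn_search import Crawler_NewsCN


def dispatch(currentEngineId: int, inputData: dict):
    match currentEngineId:
        case 10000:  # BBC online search
            return Crawler_BBCSearch().process(inputData)
        case 13000:  # Xinhua English-site online search
            return Crawler_NewsCN().process(inputData)
        case _:
            raise ValueError(f"unknown engine id {currentEngineId}")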
@@ -17,8 +17,11 @@ class submit_test:
 # # Create a test queue: 9000 / GoogleSearch
 # submit_test.submit(9000, {"keyword": keyword})

-# # Create a test queue: 10000 / BBCSearch
-# submit_test.submit(10000, {"keyword": keyword})
+# # # Create a test queue: 10000 / BBCSearch
+submit_test.submit(10000, {"keyword": "习近平"})
+submit_test.submit(10000, {"keyword": "US election"})
+# submit_test.submit(10000, {"keyword": "US election"})
+# submit_test.submit(10000, {"keyword": "US election"})

 # # Create a test queue: 11000 / Wikipedia
 # submit_test.submit(11000, {"keyword": keyword})