forked from iCON/CrawlerEngines

Xinhua English-site online search (新华网英文站在线搜索)

parent e0ae84581e
commit 294cf0eba1

engines/crawler_newscn_search.py

@@ -8,36 +8,71 @@ import json
 import time
 import re
 import os
 from utils.logger import logger
 
-BASE_URL = "https://english.news.cn"
-XINHUA_OVERSEAS_REGIONS = ["asiapacific", "europe", "africa", "northamerica"]
+# from utils.logger import logger
+
+BASE_URL_EN = "https://english.news.cn"
+BASE_URL_CN = "https://so.news.cn/"
+XINHUA_OVERSEAS_REGIONS = [
+    "asiapacific",
+    "europe",
+    "africa",
+    "northamerica",
+    "german",
+    "20241016",
+]
 
 # The maximum number of blog posts to crawl
 # C:\Users\MillionZhang\.cache\selenium\chromedriver\win64
-MAX_BLOG_LIMIT = 15
+MAX_BLOG_LIMIT = 10
 
 
 class Crawler_NewsCN:
-    def __init__(self, driver: webdriver) -> None:
-        self.driver = driver
+    def __init__(self) -> None:
+        from selenium.webdriver.chrome.options import Options
+
+        chrome_options = Options()
+        chrome_options.add_argument("--headless")
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--disable-dev-shm-usage")
+        self.driver = webdriver.Chrome(options=chrome_options)
+        self.selected_blogs = {"titles": [], "urls": []}
 
-    def __search_topic(self, topic: str):
-        self.driver.get(BASE_URL)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
+    def __search_topic(self, topic: str, lang):
+        if lang == "en":
+            self.driver.get(BASE_URL_EN)
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
 
-        # input topic to be searched in search bar
-        search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
-        search_bar.send_keys(topic)
+            # input topic to be searched in search bar
+            search_bar = self.driver.find_element(By.CLASS_NAME, "search-input")
+            search_bar.send_keys(topic)
 
-        # click search button
-        search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
-        search_submit_button.click()
+            # click search button
+            search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+            search_submit_button.click()
 
-        # close home window and switch to new window
-        self.driver.close()
-        self.driver.switch_to.window(self.driver.window_handles[0])
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+            # close home window and switch to new window
+            self.driver.close()
+            self.driver.switch_to.window(self.driver.window_handles[0])
+            WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua research"))
+        else:
+            if lang == "cn":
+                self.driver.get(BASE_URL_CN)
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
+
+                # input topic to be searched in search bar
+                search_bar = self.driver.find_element(By.CLASS_NAME, "input")
+                search_bar.send_keys(topic)
+
+                # click search button
+                # search_submit_button = self.driver.find_element(By.ID, "searchSubmit")
+                search_submit_button = self.driver.find_element(
+                    By.CLASS_NAME, "search-button"
+                )
+                search_submit_button.click()
+                self.driver.switch_to.window(self.driver.window_handles[0])
+                WebDriverWait(self.driver, 10).until(EC.title_contains("新华搜索"))
 
     def __select_blog_in_search_page(self):
         raw_blogs = self.driver.find_element(By.CLASS_NAME, "content").get_attribute(
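
Note on this hunk: Crawler_NewsCN now constructs its own headless Chrome driver instead of receiving one from the caller, and __search_topic dispatches on lang between the English portal (english.news.cn) and the Chinese search site (so.news.cn). A minimal usage sketch of the reworked class, via the process entry point added in the next hunk (a hypothetical ad-hoc test, not part of this commit):

crawler = Crawler_NewsCN()  # builds its own headless Chrome driver
# lang selects the English ("en") or Chinese ("cn") search flow
blogs = crawler.process({"keyword": "china", "lang": "en"})
for blog in blogs:
    # each item is {"title": ..., "meta": ..., "content": ...}
    print(blog["title"])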
@@ -103,57 +138,91 @@ class Crawler_NewsCN:
         return {"title": blog_title, "meta": blog_meta, "content": blog_content}
 
-    def __get_and_save_blog(self, url: str):
+    # used when lang == "cn"
+    def __retrieve_cn_blog(self) -> dict:
+        blog_title = (
+            self.driver.find_element(By.CLASS_NAME, "head-line.clearfix")
+            .find_element(By.CLASS_NAME, "title")
+            .text
+        )
+
+        blog_meta = (
+            self.driver.find_element(By.CLASS_NAME, "header-cont.clearfix")
+            .find_element(By.CLASS_NAME, "source")
+            .text
+        )
+
+        blog_content = self.driver.find_element(By.ID, "detailContent").text
+
+        return {"title": blog_title, "meta": blog_meta, "content": blog_content}
+
+    def __get_and_save_blog(self, url: str, lang):
         self.driver.get(url)
-        WebDriverWait(self.driver, 10).until(EC.title_contains("Xinhua"))
-        region_code = urllib.parse.urlparse(url).path.split("/")[1]
-        if region_code in XINHUA_OVERSEAS_REGIONS:
+        WebDriverWait(self.driver, 10).until(EC.title_contains("新华"))
+
+        if lang == "en":
+            # region_code = urllib.parse.urlparse(url).path.split("/")[1]
+            # if region_code in XINHUA_OVERSEAS_REGIONS:
+            #     blog = self.__retrieve_overseas_blog()
+            # else:
+            #     blog = self.__retrieve_china_blog()
+
             blog = self.__retrieve_overseas_blog()
             blog_title = blog.get("title", "")
-        else:
-            blog = self.__retrieve_china_blog()
-            blog_title = blog.get("title", "")
-        print(blog_title)
+        if lang == "cn":
+            blog = self.__retrieve_cn_blog()
+            blog_title = blog.get("title", "")
+        # print(blog_title)
 
         # Remove invalid char in file_path_name on Windows
-        invalid_chars_pattern = r'[\\/:*?"<>|]'
-        blog_title = re.sub(invalid_chars_pattern, "", blog_title)
+        # invalid_chars_pattern = r'[\\/:*?"<>|]'
+        # blog_title = re.sub(invalid_chars_pattern, "", blog_title)
 
-        file = open(os.path.join("", "Xinhua_{blog_title}.json"), "w")
-        json.dump(blog, file)
-        file.close()
-        time.sleep(2)
+        # file = open(os.path.join("", f"Xinhua_{blog_title}.json"), "w")
+        # json.dump(blog, file)
+        # file.close()
+        # time.sleep(2)
+        return blog
 
-    def search_and_save(self, topic: str):
-        self.__search_topic(topic)
+    def search_and_save(self, topic: str, lang):
+        content_list = []
+        self.__search_topic(topic, lang)
         time.sleep(1)
         self.__select_blog_in_search_page()
-        print(self.selected_blogs)
+        # print(self.selected_blogs)
         url_list = self.selected_blogs.get("urls", [])
 
+        idx = 1
         for url in url_list:
-            self.__get_and_save_blog(url)
+            logger.warning(f"{idx}/{len(url_list)} url:{url}")
+            content = self.__get_and_save_blog(url, lang)
+            content_list.append(content)
+            idx += 1
+        return content_list
 
-    def direct_save(self, url: str):
-        self.__get_and_save_blog(url)
-
-    def test(self):
+    def process(self, inputData):
+        # return {"full_name": "xxx", "date_of_birth": "1956-01-01"}
+
+        print("Crawler_NewsCN / inputData", inputData)
+        keyword = inputData["keyword"]
+        lang = inputData["lang"]
+
+        result = []
         try:
-            # self.search_and_save("china")
-            self.search_and_save("xi jinping")
-            # self.direct_save("<an url>")
+            result = self.search_and_save(keyword, lang)
+            # print("result", result)
         except Exception as e:
             import traceback
 
             print(traceback.format_exc())
-        finally:
-            self.driver.quit()
-
-
-if __name__ == "__main__":
-    from selenium.webdriver.chrome.options import Options
-
-    chrome_options = Options()
-    chrome_options.add_argument("--headless")
-    chrome_options.add_argument("--no-sandbox")
-    chrome_options.add_argument("--disable-dev-shm-usage")
-    driver = webdriver.Chrome(options=chrome_options)
-    crawler = Crawler_NewsCN(driver)
-    crawler.test()
+        # finally:
+        #     self.driver.quit()
+        logger.warning(
+            f"【Crawler_NewsCN process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
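
Note on the disabled save-to-disk block: the removed line open(os.path.join("", "Xinhua_{blog_title}.json"), "w") was missing the f-string prefix, so every blog would have been written to the literal file Xinhua_{blog_title}.json; the commented-out replacement adds the f but leaves the whole block disabled. If saving is ever re-enabled, a sketch of a corrected helper (hypothetical, reusing the commit's own invalid-character pattern for Windows file names):

import json
import os
import re

def save_blog(blog: dict, out_dir: str = "") -> str:
    # strip characters Windows forbids in file names (same pattern as the commit)
    title = re.sub(r'[\\/:*?"<>|]', "", blog.get("title", "untitled"))
    path = os.path.join(out_dir, f"Xinhua_{title}.json")  # note the f prefix
    with open(path, "w", encoding="utf-8") as file:
        json.dump(blog, file, ensure_ascii=False)
    return path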
engines/info_extractor.py

@@ -0,0 +1,3 @@
+class InfoExtractor:
+    def process(self, inputData):
+        print(inputData)
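
Note: InfoExtractor is only a stub at this point; process prints its input and implicitly returns None, so the 3000 branch in main.py yields nothing yet. The commented hint in Crawler_NewsCN.process suggests the intended return shape; a sketch of a filled-in stub under that assumption (field names taken from that comment, the hard-coded values purely illustrative):

class InfoExtractor:
    def process(self, inputData):
        print(inputData)
        # placeholder extraction result; real logic would parse inputData
        return {"full_name": "xxx", "date_of_birth": "1956-01-01"}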
main.py

@@ -1,4 +1,6 @@
 from engines.crawler_google_search import CrawlerGoogleSearch
+from engines.crawler_newscn_search import Crawler_NewsCN
+from engines.info_extractor import InfoExtractor
 from utils.logger import logger
 import os, sys, time, traceback, json
 
@@ -8,26 +10,29 @@ sys.path.append(relative_path)
 from queue_processor import QueueProcessor
 
+infoExtractor = InfoExtractor()
+crawlerGoogleSearch = CrawlerGoogleSearch()
+crawler_NewsCN = Crawler_NewsCN()
 
 
 class Main(QueueProcessor):
     def processor_handle(self, input):
         print("input:", input)  # {'keyword': '林郑月娥' }
 
         currentEngineId = input["currentEngineId"]
         inputData = json.loads(input["inputData"])
-        # keyword = inputData["keyword"]
-        # print("keyword:", keyword)
         match currentEngineId:
-            case 3000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 9000:  # crawler_bbc_search
-                return CrawlerGoogleSearch.process(inputData)
-            case 10000:  # crawler_bbc_search
+            case 3000:  # InfoExtractor (entity information extraction)
+                return infoExtractor.process(inputData)
+            case 9000:  # crawler_google_search (Google online search)
+                return crawlerGoogleSearch.process(inputData)
+            case 10000:  # crawler_bbc_search (BBC online search)
                 print(2)
             case 11000:  # crawler_wikipedia
                 print(3)
             case 12000:  # crawler_webb_site
                 print(4)
+            case 13000:  # crawler_NewsCN (Xinhua English-site online search)
+                return crawler_NewsCN.process(inputData)
 
 
 if __name__ == "__main__":
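
Note: processor_handle expects each queue message to carry currentEngineId as a number and inputData as a JSON-encoded string (hence the json.loads). A message that reaches the new 13000 branch would look roughly like this (illustrative values only):

# illustrative queue message for the Crawler_NewsCN branch (engine id 13000);
# inputData is a JSON string that processor_handle decodes with json.loads
message = {
    "currentEngineId": 13000,
    "inputData": '{"keyword": "china", "lang": "en"}',
}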
@@ -8,20 +8,24 @@ from queue_client import QueueClient
 
 class submit_test:
-    def submit(code, keyword):
+    def submit(code, data):
         client = QueueClient()
-        returnData = client.call(code, {"keyword": keyword})
+        returnData = client.call(code, data)
         print(returnData)
 
 
-# Create a test queue: 9000 / GoogleSearch
-submit_test.submit(9000, "林郑月娥")
+# # Create a test queue: 9000 / GoogleSearch
+# submit_test.submit(9000, {"keyword": keyword})
 
-# Create a test queue: 10000 / BBCSearch
-submit_test.submit(10000, "林郑月娥")
+# # Create a test queue: 10000 / BBCSearch
+# submit_test.submit(10000, {"keyword": keyword})
 
-# Create a test queue: 11000 / Wikipedia
-submit_test.submit(11000, "林郑月娥")
+# # Create a test queue: 11000 / Wikipedia
+# submit_test.submit(11000, {"keyword": keyword})
 
-# Create a test queue: 12000 / WebbSite
-submit_test.submit(12000, "林郑月娥")
+# # Create a test queue: 12000 / WebbSite
+# submit_test.submit(12000, {"keyword": keyword})
 
+# Create a test queue: 13000 / Crawler_NewsCN (Xinhua English-site online search)
+submit_test.submit(13000, {"keyword": "china", "lang": "en"})
+submit_test.submit(13000, {"keyword": "中国", "lang": "cn"})
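
Note: the re-commented calls above reference a bare keyword variable that is never defined in this file, so they would raise NameError if uncommented as-is; the live 13000 calls pass concrete payloads instead. Re-enabling one of the older queue tests would need a literal payload, e.g. (illustrative):

# illustrative: re-enabling an older queue test needs a concrete payload
submit_test.submit(9000, {"keyword": "林郑月娥"})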