# CrawlerEngines/engines/crawler_wikipedia.py (Python, 50 lines, 1.6 KiB)

import wikipedia
import wikipediaapi
from utils.logger import logger
# NOTE(review): not referenced anywhere in the visible part of this module.
MAX_SEARCH_RESULT = 1
# Upper bound on the number of characters of a page summary that is returned.
NUMBER_CHARS_RETURN = 10000
class Crawler_Wikipedia:
    """Crawler that returns a truncated Wikipedia summary for a keyword."""

    def __search_topic_and_save(self, keyword, wiki_wiki):
        """Resolve ``keyword`` to a page title and return that page's summary.

        Args:
            keyword: Search term handed to ``wikipedia.page``.
            wiki_wiki: ``wikipediaapi.Wikipedia`` client used to fetch the
                summary of the resolved page.

        Returns:
            The page summary, cut to at most ``NUMBER_CHARS_RETURN`` characters.

        Raises:
            Any exception raised by ``wikipedia.page`` other than
            ``DisambiguationError`` propagates to the caller.
        """
        try:
            # Direct hit: use the title of the page the lookup resolved to.
            title = wikipedia.page(keyword).title
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous keyword: take the first non-blank suggested title.
            # Fall back to the raw keyword when no usable option is offered.
            options = [option for option in e.options if option and option.strip()]
            title = options[0] if options else keyword
        page_py = wiki_wiki.page(title)
        return page_py.summary[:NUMBER_CHARS_RETURN]

    def search_and_save(self, keyword, lang):
        """Look up ``keyword`` on the ``lang`` edition of Wikipedia.

        Args:
            keyword: Search term.
            lang: Wikipedia language code (``en`` for English, ``zh`` for
                Chinese).

        Returns:
            A one-element list holding the truncated summary text.
        """
        # Both client libraries must target the same language edition.
        wikipedia.set_lang(lang)
        wiki_wiki = wikipediaapi.Wikipedia('icon (icon@iconsz.com)', language=lang)
        content_list = [self.__search_topic_and_save(keyword, wiki_wiki)]
        logger.warning(f"Crawler_Wikipedia topic:{keyword}")
        return content_list

    def process(self, inputData):
        """Entry point: crawl Wikipedia for the request described by ``inputData``.

        Args:
            inputData: Mapping that must provide ``"keyword"`` and ``"lang"``
                (``en`` for English, ``zh`` for Chinese).

        Returns:
            The list produced by ``search_and_save``, or an empty list when
            the lookup raised; the failure is logged rather than re-raised.

        Raises:
            KeyError: If ``"keyword"`` or ``"lang"`` is missing from
                ``inputData``.
        """
        logger.warning(f"Crawler_Wikipedia / inputData {inputData}")
        keyword = inputData["keyword"]
        lang = inputData["lang"]
        result = []
        try:
            result = self.search_and_save(keyword, lang)
        except Exception:
            # Best-effort crawler: record the traceback and return the empty
            # default instead of failing the caller.
            import traceback
            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
        logger.warning(
            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result
if __name__ == "__main__":
    # Manual smoke run: look up 'prc' on the English Wikipedia.
    Crawler_Wikipedia().search_and_save('prc', 'en')