forked from iCON/CrawlerEngines
50 lines
1.6 KiB
Python
50 lines
1.6 KiB
Python
import wikipedia
|
|
import wikipediaapi
|
|
from utils.logger import logger
|
|
|
|
# Intended cap on the number of search results.
# NOTE(review): not referenced anywhere in the visible code — confirm whether it is still needed.
MAX_SEARCH_RESULT = 1
|
|
# Maximum number of characters of a page summary that Crawler_Wikipedia returns.
NUMBER_CHARS_RETURN = 10000
|
|
|
|
class Crawler_Wikipedia:
    """Fetch the summary of the Wikipedia article matching a keyword.

    The ``wikipedia`` package resolves the keyword to a page title and a
    ``wikipediaapi.Wikipedia`` client retrieves that page's summary.
    """

    def __search_topic_and_save(self, keyword, wiki_wiki):
        """Return the summary of the page that best matches *keyword*.

        Args:
            keyword: Search term to resolve to a page title.
            wiki_wiki: ``wikipediaapi.Wikipedia`` client used to fetch the page.

        Returns:
            The page summary truncated to ``NUMBER_CHARS_RETURN`` characters.

        Raises:
            Exceptions raised by ``wikipedia.page`` other than
            ``DisambiguationError`` are not handled here and propagate to
            the caller.
        """
        try:
            # Unambiguous keyword: use the title the lookup resolved to.
            title = wikipedia.page(keyword).title
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous keyword: take the first candidate page offered by
            # the library; fall back to the raw keyword if none are listed.
            options = e.options
            title = options[0] if options else keyword

        page_py = wiki_wiki.page(title)
        return page_py.summary[:NUMBER_CHARS_RETURN]

    def search_and_save(self, keyword, lang):
        """Look up *keyword* on the *lang* edition of Wikipedia.

        Args:
            keyword: Search term.
            lang: Wikipedia language code passed to both client libraries.

        Returns:
            A one-element list containing the (truncated) page summary.
        """
        wikipedia.set_lang(lang)
        wiki_wiki = wikipediaapi.Wikipedia('icon (icon@iconsz.com)', language=lang)
        content_list = [self.__search_topic_and_save(keyword, wiki_wiki)]
        logger.warning(f"Crawler_Wikipedia topic:{keyword}")
        return content_list

    def process(self, inputData):
        """Run the crawler for one request and never raise on lookup failure.

        Args:
            inputData: Mapping with the keys ``"keyword"`` (search term) and
                ``"lang"`` (language code: ``en`` for English, ``zh`` for
                Chinese).

        Returns:
            The list produced by ``search_and_save``, or an empty list when
            the lookup raised; the traceback is logged in that case.

        Raises:
            KeyError: If *inputData* is a dict lacking ``"keyword"`` or
                ``"lang"`` (these reads happen outside the ``try``).
        """
        logger.warning(f"Crawler_Wikipedia / inputData {inputData}")
        keyword = inputData["keyword"]
        # lang: en for English, zh for Chinese
        lang = inputData["lang"]

        result = []
        try:
            result = self.search_and_save(keyword, lang)
        except Exception:
            # Best-effort boundary: log the failure and return an empty result.
            import traceback
            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")

        logger.warning(
            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: look up 'prc' on the English Wikipedia.
    crawler = Crawler_Wikipedia()
    crawler.search_and_save('prc', 'en')
|