diff --git a/.gitignore b/.gitignore
index 8ef114b..56d6c13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 /logs
-__pycache__
\ No newline at end of file
+__pycache__
+/venv
\ No newline at end of file
diff --git a/engines/crawler_wikipedia.py b/engines/crawler_wikipedia.py
index e69de29..5688191 100644
--- a/engines/crawler_wikipedia.py
+++ b/engines/crawler_wikipedia.py
@@ -0,0 +1,70 @@
+import traceback
+
+import wikipedia
+import wikipediaapi
+from utils.logger import logger
+
+MAX_SEARCH_RESULT = 1
+# Upper bound on the number of summary characters returned per lookup.
+NUMBER_CHARS_RETURN = 10000
+
+
+class Crawler_Wikipedia:
+    """Fetch Wikipedia page summaries for a keyword in a given language."""
+
+    def __search_topic_and_save(self, keyword, wiki_wiki):
+        """Return the summary for ``keyword``, capped at NUMBER_CHARS_RETURN.
+
+        An ambiguous keyword is resolved to the first title offered by the
+        disambiguation error and fetched through ``wiki_wiki``; an empty
+        option list yields "". Other lookup errors propagate to the caller.
+        """
+        try:
+            page = wikipedia.page(keyword)
+        except wikipedia.exceptions.DisambiguationError as e:
+            if not e.options:
+                return ""
+            # The wikipediaapi page also exposes ``summary``, so rebinding
+            # ``page`` keeps a single return path for both outcomes.
+            page = wiki_wiki.page(e.options[0])
+
+        return page.summary[:NUMBER_CHARS_RETURN]
+
+    def search_and_save(self, keyword, lang):
+        """Look up ``keyword`` on the ``lang`` edition of Wikipedia.
+
+        Returns a one-element list holding the summary text.
+        """
+        wikipedia.set_lang(lang)
+        wiki_wiki = wikipediaapi.Wikipedia('icon (icon@iconsz.com)', language=lang)
+        content_list = [self.__search_topic_and_save(keyword, wiki_wiki)]
+        logger.warning(f"Crawler_Wikipedia topic:{keyword}")
+        return content_list
+
+    def process(self, inputData):
+        """Run the lookup described by ``inputData`` and return its result.
+
+        ``inputData`` must contain "keyword" and "lang" ("en" for English,
+        "zh" for Chinese). A failed lookup is logged and produces an empty
+        list instead of raising.
+        """
+        logger.warning(f"Crawler_Wikipedia / inputData {inputData}")
+        keyword = inputData["keyword"]
+        lang = inputData["lang"]
+
+        result = []
+        try:
+            result = self.search_and_save(keyword, lang)
+        except Exception:
+            # Best-effort crawl: record the traceback and return [].
+            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
+
+        logger.warning(
+            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
+        )
+        return result
+
+
+if __name__ == "__main__":
+    crawler = Crawler_Wikipedia()
+    crawler.search_and_save('prc', 'en')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..59cd532
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,78 @@
+asttokens==2.4.1
+attrs==24.2.0
+backcall==0.2.0
+beautifulsoup4==4.12.3
+certifi==2024.8.30
+charset-normalizer==3.3.2
+comm==0.2.2
+contourpy==1.1.1
+cycler==0.12.1
+debugpy==1.8.6
+decorator==5.1.1
+exceptiongroup==1.2.2
+executing==2.1.0
+fonttools==4.54.1
+h11==0.14.0
+idna==3.10
+importlib-metadata==8.5.0
+importlib-resources==6.4.5
+ipykernel==6.29.5
+ipython==8.12.3
+jedi==0.19.1
+jupyter-client==8.6.3
+jupyter-core==5.7.2
+kiwisolver==1.4.7
+matplotlib==3.7.5
+matplotlib-inline==0.1.7
+mock==5.1.0
+nest-asyncio==1.6.0
+numpy==1.24.4
+outcome==1.3.0.post0
+packaging==24.1
+pandas==2.0.3
+parso==0.8.4
+pexpect==4.9.0
+pickleshare==0.7.5
+pillow==10.4.0
+platformdirs==4.3.6
+prompt-toolkit==3.0.48
+psutil==6.0.0
+ptyprocess==0.7.0
+pure-eval==0.2.3
+pygments==2.18.0
+pyparsing==3.1.4
+PySocks==1.7.1
+python-dateutil==2.9.0.post0
+pytz==2024.2
+pyzmq==26.2.0
+requests==2.32.3
+scipy==1.10.1
+selenium==4.25.0
+six==1.16.0
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+subword-nmt==0.3.8
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+trio==0.26.2
+trio-websocket==0.11.1
+typing-extensions==4.12.2
+tzdata==2024.2
+urllib3==2.2.3
+wcwidth==0.2.13
+websocket-client==1.8.0
+wsproto==1.2.0
+zipp==3.20.2
+beautifulsoup4==4.12.3
+certifi==2024.8.30
+charset-normalizer==3.4.0
+idna==3.10
+requests==2.32.3
+setuptools==75.2.0
+soupsieve==2.6
+urllib3==2.2.3
+wikipedia==1.4.0
+Wikipedia-API==0.7.1