1
0
Fork 0

更新维基爬虫,更新包依赖

main
songtao 2024-10-28 14:14:05 +08:00
parent 57a595ad09
commit 17492cbb5e
3 changed files with 129 additions and 1 deletions

3
.gitignore vendored
View File

@ -1,2 +1,3 @@
/logs
__pycache__
__pycache__
/venv

View File

@ -0,0 +1,49 @@
import wikipedia
import wikipediaapi
from utils.logger import logger
# Presumably the maximum number of search results to use; it is not
# referenced by any code visible in this module -- TODO confirm or remove.
MAX_SEARCH_RESULT = 1
# Upper bound on the number of characters of an article summary returned
# by the crawler (the summary is sliced to this length).
NUMBER_CHARS_RETURN = 10000
class Crawler_Wikipedia:
    """Crawler that returns the summary of the Wikipedia article for a keyword."""

    def __search_topic_and_save(self, keyword, wiki_wiki):
        """Resolve *keyword* to an article title and return its summary.

        Args:
            keyword: Search term or article title to look up.
            wiki_wiki: ``wikipediaapi.Wikipedia`` client for the target
                language, used to fetch the article itself.

        Returns:
            The first ``NUMBER_CHARS_RETURN`` characters of the article
            summary, or ``""`` when the keyword is ambiguous and the
            disambiguation error carries no candidate titles.

        Raises:
            Whatever ``wikipedia.page`` raises other than
            ``DisambiguationError`` (for example when no page matches).
        """
        try:
            title = wikipedia.page(keyword).title
        except wikipedia.exceptions.DisambiguationError as e:
            # Ambiguous keyword: fall back to the first candidate title
            # instead of parsing the exception's string representation.
            options = e.options
            if not options:
                return ""
            title = options[0]
        page_py = wiki_wiki.page(title)
        return page_py.summary[0:NUMBER_CHARS_RETURN]

    def search_and_save(self, keyword, lang):
        """Fetch the summary for *keyword* from the *lang* Wikipedia.

        Args:
            keyword: Search term or article title to look up.
            lang: Wikipedia language code, e.g. ``"en"`` or ``"zh"``.

        Returns:
            A one-element list containing the truncated article summary.
        """
        # Both client libraries must be pointed at the same language edition.
        wikipedia.set_lang(lang)
        wiki_wiki = wikipediaapi.Wikipedia('icon (icon@iconsz.com)', language=lang)
        content_list = [self.__search_topic_and_save(keyword, wiki_wiki)]
        logger.warning(f"Crawler_Wikipedia topic:{keyword}")
        return content_list

    def process(self, inputData):
        """Run the crawler for one request and never propagate lookup errors.

        Args:
            inputData: Mapping with the keys ``"keyword"`` (search term) and
                ``"lang"`` (language code: ``en`` for English, ``zh`` for
                Chinese).

        Returns:
            The list produced by ``search_and_save``, or an empty list when
            the lookup raised an exception (the traceback is logged).

        Raises:
            KeyError: If ``inputData`` lacks ``"keyword"`` or ``"lang"``.
        """
        logger.warning(f"Crawler_Wikipedia / inputData {inputData}")
        keyword = inputData["keyword"]
        # lang: en for English, zh for Chinese
        lang = inputData["lang"]
        result = []
        try:
            result = self.search_and_save(keyword, lang)
        except Exception:
            # Best-effort boundary: log the failure and return the empty
            # result so one bad keyword does not abort the caller.
            import traceback
            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")
        logger.warning(
            f"Crawler_Wikipedia process completed】, keyword={keyword}, result len={len(result)}"
        )
        return result
if __name__ == "__main__":
    # Manual smoke run: fetch the English summary for the keyword "prc".
    Crawler_Wikipedia().search_and_save('prc', 'en')

78
requirements.txt 100644
View File

@ -0,0 +1,78 @@
asttokens==2.4.1
attrs==24.2.0
backcall==0.2.0
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.3.2
comm==0.2.2
contourpy==1.1.1
cycler==0.12.1
debugpy==1.8.6
decorator==5.1.1
exceptiongroup==1.2.2
executing==2.1.0
fonttools==4.54.1
h11==0.14.0
idna==3.10
importlib-metadata==8.5.0
importlib-resources==6.4.5
ipykernel==6.29.5
ipython==8.12.3
jedi==0.19.1
jupyter-client==8.6.3
jupyter-core==5.7.2
kiwisolver==1.4.7
matplotlib==3.7.5
matplotlib-inline==0.1.7
mock==5.1.0
nest-asyncio==1.6.0
numpy==1.24.4
outcome==1.3.0.post0
packaging==24.1
pandas==2.0.3
parso==0.8.4
pexpect==4.9.0
pickleshare==0.7.5
pillow==10.4.0
platformdirs==4.3.6
prompt-toolkit==3.0.48
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.3
pygments==2.18.0
pyparsing==3.1.4
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.2
pyzmq==26.2.0
requests==2.32.3
scipy==1.10.1
selenium==4.25.0
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
stack-data==0.6.3
subword-nmt==0.3.8
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
trio==0.26.2
trio-websocket==0.11.1
typing-extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
wcwidth==0.2.13
websocket-client==1.8.0
wsproto==1.2.0
zipp==3.20.2
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.4.0
idna==3.10
requests==2.32.3
setuptools==75.2.0
soupsieve==2.6
urllib3==2.2.3
wikipedia==1.4.0
Wikipedia-API==0.7.1