forked from iCON/CrawlerEngines
Update the Wikipedia crawler; update package dependencies
parent 57a595ad09
commit 17492cbb5e
@@ -1,2 +1,3 @@
/logs
__pycache__
/venv
@@ -0,0 +1,49 @@
import wikipedia
import wikipediaapi

from utils.logger import logger

MAX_SEARCH_RESULT = 1
NUMBER_CHARS_RETURN = 10000


class Crawler_Wikipedia:

    def __search_topic_and_save(self, keyword, wiki_wiki):
        try:
            # Resolve the keyword directly; on a disambiguation page,
            # fall back to the first suggested title.
            page = wikipedia.page(keyword)
            page_py = wiki_wiki.page(page.title)
        except wikipedia.exceptions.DisambiguationError as e:
            queries = str(e).split('\n')[1:]
            page_py = wiki_wiki.page(queries[0])

        return page_py.summary[0:NUMBER_CHARS_RETURN]

    def search_and_save(self, keyword, lang):
        wikipedia.set_lang(lang)
        wiki_wiki = wikipediaapi.Wikipedia('icon (icon@iconsz.com)', language=lang)
        content_list = [self.__search_topic_and_save(keyword, wiki_wiki)]
        logger.warning(f"Crawler_Wikipedia keyword:{keyword}")
        return content_list

    def process(self, inputData):
        logger.warning(f"Crawler_Wikipedia / inputData {inputData}")
        keyword = inputData["keyword"]
        # lang: en for English, zh for Chinese
        lang = inputData["lang"]

        result = []
        try:
            result = self.search_and_save(keyword, lang)
        except Exception:
            import traceback
            logger.warning(f"Crawler_Wikipedia {traceback.format_exc()}")

        logger.warning(
            f"Crawler_Wikipedia process completed, keyword={keyword}, result len={len(result)}"
        )
        return result


if __name__ == "__main__":
    crawler = Crawler_Wikipedia()
    crawler.search_and_save('prc', 'en')
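For reference, a minimal usage sketch of the new crawler (the module path is hypothetical, and it assumes the project's utils.logger and the packages listed below are available): process() takes a dict with "keyword" and "lang" and returns a list of summary strings.

from crawler_wikipedia import Crawler_Wikipedia  # hypothetical module path for the new file

crawler = Crawler_Wikipedia()
summaries = crawler.process({"keyword": "Python (programming language)", "lang": "en"})
print(len(summaries), summaries[0][:200])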
@@ -0,0 +1,78 @@
asttokens==2.4.1
attrs==24.2.0
backcall==0.2.0
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.3.2
comm==0.2.2
contourpy==1.1.1
cycler==0.12.1
debugpy==1.8.6
decorator==5.1.1
exceptiongroup==1.2.2
executing==2.1.0
fonttools==4.54.1
h11==0.14.0
idna==3.10
importlib-metadata==8.5.0
importlib-resources==6.4.5
ipykernel==6.29.5
ipython==8.12.3
jedi==0.19.1
jupyter-client==8.6.3
jupyter-core==5.7.2
kiwisolver==1.4.7
matplotlib==3.7.5
matplotlib-inline==0.1.7
mock==5.1.0
nest-asyncio==1.6.0
numpy==1.24.4
outcome==1.3.0.post0
packaging==24.1
pandas==2.0.3
parso==0.8.4
pexpect==4.9.0
pickleshare==0.7.5
pillow==10.4.0
platformdirs==4.3.6
prompt-toolkit==3.0.48
psutil==6.0.0
ptyprocess==0.7.0
pure-eval==0.2.3
pygments==2.18.0
pyparsing==3.1.4
PySocks==1.7.1
python-dateutil==2.9.0.post0
pytz==2024.2
pyzmq==26.2.0
requests==2.32.3
scipy==1.10.1
selenium==4.25.0
six==1.16.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.6
stack-data==0.6.3
subword-nmt==0.3.8
tornado==6.4.1
tqdm==4.66.5
traitlets==5.14.3
trio==0.26.2
trio-websocket==0.11.1
typing-extensions==4.12.2
tzdata==2024.2
urllib3==2.2.3
wcwidth==0.2.13
websocket-client==1.8.0
wsproto==1.2.0
zipp==3.20.2
beautifulsoup4==4.12.3
certifi==2024.8.30
charset-normalizer==3.4.0
idna==3.10
requests==2.32.3
setuptools==75.2.0
soupsieve==2.6
urllib3==2.2.3
wikipedia==1.4.0
Wikipedia-API==0.7.1