Spaces:
Runtime error
Runtime error
Commit
·
0573e7b
1
Parent(s):
73855f3
findarticles
Browse files
huixiangdou/service/findarticles.py
CHANGED
|
@@ -6,6 +6,8 @@ from tqdm import tqdm
|
|
| 6 |
import json
|
| 7 |
import shutil
|
| 8 |
from loguru import logger
|
|
|
|
|
|
|
| 9 |
class ArticleRetrieval:
|
| 10 |
def __init__(self,
|
| 11 |
keywords: list,
|
|
@@ -32,19 +34,22 @@ class ArticleRetrieval:
|
|
| 32 |
return pmc_ids
|
| 33 |
|
| 34 |
# 解析XML文件
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
## 清洗XML文件
|
| 45 |
def _clean_xml(self,txt):
|
| 46 |
-
|
| 47 |
-
root = ET.fromstring(txt)
|
| 48 |
txt = self._get_all_text(root)
|
| 49 |
txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
|
| 50 |
text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
|
|
@@ -89,5 +94,4 @@ if __name__ == '__main__':
|
|
| 89 |
if os.path.exists('repodir'):
|
| 90 |
shutil.rmtree('repodir')
|
| 91 |
articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
|
| 92 |
-
|
| 93 |
-
articelfinder.fetch_full_text(pmc_ids)
|
|
|
|
| 6 |
import json
|
| 7 |
import shutil
|
| 8 |
from loguru import logger
|
| 9 |
+
from lxml import etree
|
| 10 |
+
|
| 11 |
class ArticleRetrieval:
|
| 12 |
def __init__(self,
|
| 13 |
keywords: list,
|
|
|
|
| 34 |
return pmc_ids
|
| 35 |
|
| 36 |
# 解析XML文件
|
| 37 |
+
def _get_all_text(self, element):
    """Return the concatenated text of an XML element and all its descendants.

    The text is collected in document order: an element's own ``text``
    first, then, for each child, the child's full text followed by the
    child's ``tail``. The ``tail`` of ``element`` itself is not included.

    The traversal is iterative (explicit stack) rather than recursive, so
    deeply nested documents cannot raise ``RecursionError``, and the result
    is assembled with a single ``join`` instead of repeated string
    concatenation.

    Args:
        element: An ElementTree-style element exposing ``text``, ``tail``
            and iteration over its children, or ``None``.

    Returns:
        The collected text, or an empty string when ``element`` is ``None``.
    """
    if element is None:
        return ""

    parts = []
    # Each stack entry is (node, emit_tail). A node is visited once with
    # emit_tail=False to emit its text and schedule its children, and (for
    # non-root nodes) once with emit_tail=True after its whole subtree has
    # been emitted, to append its tail.
    stack = [(element, False)]
    while stack:
        node, emit_tail = stack.pop()
        if emit_tail:
            if node.tail:
                parts.append(node.tail)
            continue
        if node.text:
            parts.append(node.text)
        # Push children in reverse so they are popped in document order;
        # the tail marker sits below the child so it fires after the subtree.
        for child in reversed(list(node)):
            stack.append((child, True))
            stack.append((child, False))
    return "".join(parts)
|
| 48 |
|
| 49 |
## 清洗XML文件
|
| 50 |
def _clean_xml(self,txt):
|
| 51 |
+
parser = etree.XMLParser(recover=True)
|
| 52 |
+
root = ET.fromstring(txt,parser=parser)
|
| 53 |
txt = self._get_all_text(root)
|
| 54 |
txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
|
| 55 |
text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
|
|
|
|
| 94 |
if os.path.exists('repodir'):
|
| 95 |
shutil.rmtree('repodir')
|
| 96 |
articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
|
| 97 |
+
articelfinder.initiallize()
|
|
|