Spaces:
Runtime error
Runtime error
Commit
·
0573e7b
1
Parent(s):
73855f3
findarticles
Browse files
huixiangdou/service/findarticles.py
CHANGED
@@ -6,6 +6,8 @@ from tqdm import tqdm
|
|
6 |
import json
|
7 |
import shutil
|
8 |
from loguru import logger
|
|
|
|
|
9 |
class ArticleRetrieval:
|
10 |
def __init__(self,
|
11 |
keywords: list,
|
@@ -32,19 +34,22 @@ class ArticleRetrieval:
|
|
32 |
return pmc_ids
|
33 |
|
34 |
# 解析XML文件
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
|
|
|
|
|
|
43 |
|
44 |
## 清洗XML文件
|
45 |
def _clean_xml(self,txt):
|
46 |
-
|
47 |
-
root = ET.fromstring(txt)
|
48 |
txt = self._get_all_text(root)
|
49 |
txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
|
50 |
text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
|
@@ -89,5 +94,4 @@ if __name__ == '__main__':
|
|
89 |
if os.path.exists('repodir'):
|
90 |
shutil.rmtree('repodir')
|
91 |
articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
|
92 |
-
|
93 |
-
articelfinder.fetch_full_text(pmc_ids)
|
|
|
6 |
import json
|
7 |
import shutil
|
8 |
from loguru import logger
|
9 |
+
from lxml import etree
|
10 |
+
|
11 |
class ArticleRetrieval:
|
12 |
def __init__(self,
|
13 |
keywords: list,
|
|
|
34 |
return pmc_ids
|
35 |
|
36 |
# 解析XML文件
|
37 |
+
def _get_all_text(self, element):
|
38 |
+
"""递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
|
39 |
+
if element is None:
|
40 |
+
return ""
|
41 |
+
|
42 |
+
text = element.text or ""
|
43 |
+
for child in element:
|
44 |
+
text += self._get_all_text(child)
|
45 |
+
if child is not None and child.tail:
|
46 |
+
text += child.tail
|
47 |
+
return text
|
48 |
|
49 |
## 清洗XML文件
|
50 |
def _clean_xml(self,txt):
|
51 |
+
parser = etree.XMLParser(recover=True)
|
52 |
+
root = ET.fromstring(txt,parser=parser)
|
53 |
txt = self._get_all_text(root)
|
54 |
txt = txt.split('REFERENCES')[0] # 截取参考文献之前的文本
|
55 |
text = '\n\n'.join([t.strip() for t in txt.split('\n') if len(t.strip())>250])
|
|
|
94 |
if os.path.exists('repodir'):
|
95 |
shutil.rmtree('repodir')
|
96 |
articelfinder = ArticleRetrieval(keywords = ['covid-19'],repo_dir = 'repodir',retmax = 5)
|
97 |
+
articelfinder.initiallize()
|
|