mishtert committed on
Commit
7b7b4e7
·
1 Parent(s): 31d27d3

Upload pharmap_url.py

Browse files
Files changed (1) hide show
  1. pharmap_url.py +154 -0
pharmap_url.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ # import utils.pharmap_utils.layout as lt
4
+ from utils.pharmap_utils.batutils import *
5
+ # import stanza
6
+
7
+ import requests
8
+ # import os.path
9
+ import io
10
+ # import PyPDF2
11
+ from pypdf.pdf import PdfFileReader
12
+ from urllib.request import Request, urlopen
13
+ from bs4 import BeautifulSoup
14
+ from bs4.element import Comment
15
+
16
+ # from utils.pharmap_utils.dtxutils import *
17
+ # from utils.pharmap_utils.dictutils import *
18
+
19
+ from utils.pharmap_utils.stanzautils import *
20
+
21
+
22
+ # @st.cache(show_spinner=True)
23
def get_ner(contents):
    """Return the unique 'DISEASE' entity texts found in *contents*.

    The text is stripped and run through the pipeline returned by
    ``call_nlp_pipeline()``. For every entity whose ``type`` is
    ``'DISEASE'`` the entity text is kept with newline characters removed.
    Progress messages are emitted with both ``print`` and ``st.write``.

    Args:
        contents: Text to analyse.

    Returns:
        A list of distinct disease strings. Duplicates are dropped through
        a set, so the order of the list is not guaranteed.
    """
    print('inside get ner')
    st.write('Reading the page...')
    pipeline = call_nlp_pipeline()
    parsed = pipeline(contents.strip())
    st.write('Getting disease names...')
    # A set comprehension both filters the entities and de-duplicates them.
    diseases = {
        entity.text.replace('\n', '')
        for entity in parsed.entities
        if entity.type == 'DISEASE'
    }
    content_list = list(diseases)
    print('got the disease names', content_list)
    st.write('Got the disease names...')
    return content_list
37
+
38
+
39
def get_ta_mapped_url(content_list):
    """Map each condition through ``non_url_flow`` and merge the results.

    ``non_url_flow`` is called once per entry of *content_list*; each call
    is expected to return an iterable, and all returned items are pooled
    and de-duplicated.
    NOTE(review): judging by the names, the items are presumably
    therapeutic-area / MeSH labels -- confirm against ``non_url_flow``.

    Args:
        content_list: Iterable of condition strings; it is also echoed to
            the page with ``st.write``.

    Returns:
        A list of the distinct items returned across all conditions
        (order not guaranteed because a set is used for de-duplication).
    """
    print('inside get_ta_mapped')
    st.write(content_list)
    st.write('Trying to get Mesh Name..')
    print('Trying to get Mesh Name..')
    # Run every lookup first, then merge, so all calls happen before any
    # result is iterated.
    per_condition = [non_url_flow(condition) for condition in content_list]
    merged = set()
    for mapped in per_condition:
        merged.update(mapped)
    ta = list(merged)
    print("Outside the loop", ta)
    return ta
57
+
58
+
59
def check_pdf_html(url):
    """Classify the resource at *url* as PDF or HTML via its Content-Type.

    Args:
        url: Address to probe with an HTTP GET.

    Returns:
        ``'pdf'`` when the Content-Type header contains ``application/pdf``,
        ``'html'`` when it contains ``text/html``, and ``''`` when the
        header is missing or names any other type.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Only the headers are needed, so stream the response instead of
    # downloading the whole body, and release the connection right away.
    # The timeout keeps an unresponsive host from blocking the app forever.
    with requests.get(url, stream=True, timeout=30) as r:
        content_type = r.headers.get('content-type')
    print(content_type)
    # The header may be absent (None); media types are case-insensitive.
    normalized = (content_type or '').lower()
    if 'application/pdf' in normalized:
        ext = 'pdf'
    elif 'text/html' in normalized:
        ext = 'html'
    else:
        ext = ''
        print('Unknown type: {}'.format(content_type))
    print(ext)
    return ext
72
+
73
+
74
+ # @st.cache
75
def get_disease_html(u):
    """Download an HTML page, reduce it to visible text and run NER on it.

    The page is fetched with a browser-like ``User-Agent`` header. The
    ``<script>``, ``<style>``, ``<header>`` and ``<footer>`` elements are
    removed, the remaining text is normalised to one non-empty chunk per
    line, and that text is handed to ``get_ner``.

    Args:
        u: URL of the HTML page.

    Returns:
        Whatever ``get_ner`` returns for the cleaned page text.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    print('inside get disease html')
    request = Request(u, headers={'User-Agent': 'Mozilla/5.0'})
    # Close the HTTP response deterministically and bound the wait time.
    with urlopen(request, timeout=30) as response:
        html = response.read()
    soup = BeautifulSoup(html, features="html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()
    # Drop page chrome; headers are removed first, then footers are looked
    # up again on the already-pruned tree.
    for tag_name in ('header', 'footer'):
        for tag in soup.find_all(tag_name):
            tag.decompose()
    text = soup.get_text()
    lines = (line.strip() for line in text.splitlines())
    # NOTE(review): splitting on a single space puts every word on its own
    # line; the usual form of this recipe splits on two spaces -- confirm
    # which one is intended.
    chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return get_ner(text)
95
+
96
+
97
+ # @st.cache(persist=True,show_spinner=True)
98
def get_disease_pdf(url, max_pages=2):
    """Download a PDF and list the disease names found on its first pages.

    Args:
        url: Address of the PDF document.
        max_pages: Upper bound on the number of leading pages to analyse.
            Defaults to 2, the limit previously hard-coded here. Documents
            with fewer pages are processed in full instead of failing with
            an out-of-range page index.

    Returns:
        A ``pandas.DataFrame`` with one row per analysed page and two
        columns: ``'pno:'`` (a one-element list holding the 1-based page
        number) and ``'conditions'`` (the list returned by ``get_ner`` for
        that page). The frame is empty when no page was analysed.

    Raises:
        requests.RequestException: If the download fails or times out.
    """
    st.write('get pdf disease')
    r = requests.get(url, timeout=30)
    data = []
    # The context manager releases the in-memory buffer even when PDF
    # parsing or entity extraction raises.
    with io.BytesIO(r.content) as f:
        reader = PdfFileReader(f)
        page_count = min(max_pages, reader.getNumPages())
        for p in range(page_count):
            contents = reader.getPage(p).extractText()
            # NOTE(review): the trailing colon in 'pno:' looks accidental,
            # but it is the emitted column name, so it is kept unchanged.
            data.append({'pno:': [p + 1],
                         'conditions': get_ner(contents)})
    # DataFrame.append was removed in pandas 2.0; building the frame from
    # the list of row dicts yields the same rows with a fresh 0..n-1 index.
    return pd.DataFrame(data, columns=['pno:', 'conditions'])
130
+
131
+
132
def get_link_mapped(url):
    """Detect what *url* points to and render the extracted results.

    * PDF: the frame from ``get_disease_pdf`` is shown with
      ``st.dataframe``.
    * HTML: the disease names from ``get_disease_html`` are passed to
      ``get_ta_mapped_url`` and the result is shown with ``st.write``.
    * Type detection failed: ``'invalid'`` is printed.
    * Any other content type (``check_pdf_html`` returned ``''``): nothing
      is rendered.

    Args:
        url: Address supplied by the user.

    Returns:
        None. All output goes to the Streamlit page or to stdout.
    """
    try:
        kind = check_pdf_html(url)
    except Exception:
        # Catch ordinary failures only (bad URL, network error, ...). A bare
        # ``except`` would also swallow KeyboardInterrupt, SystemExit and
        # other BaseException-derived control-flow signals.
        kind = 'invalid URL'

    if kind == 'pdf':
        pdf_mapped_df = get_disease_pdf(url)
        st.dataframe(pdf_mapped_df)
    elif kind == 'html':
        content_list = get_disease_html(url)
        ta = get_ta_mapped_url(content_list)
        st.write(ta)
    elif kind == 'invalid URL':
        print('invalid')
154
+