Add time tracking for each process
app.py (CHANGED)
@@ -142,6 +142,7 @@ if uploaded_files:
         # st.write(pdf.name)
         L = []
         # Entity Extraction
+        start_time_ext = time.time()
         st.write("β Extracting Entities ...")
         bytes_data = uploaded_file.read()
         journal = Journal(uploaded_file.name, bytes_data)
@@ -181,12 +182,14 @@ if uploaded_files:
         chunkdf.append(df)

         concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-        st.write("β Entities Extraction Done ..")
+        st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
         time.sleep(0.1)
+        start_time_summ = time.time()
         st.write("β Generating Summary ...")
         summary = get_summ(pdf.name)
-        st.write("β Generating Summary Done ..")
+        st.write("β Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
         time.sleep(0.1)
+        start_time_tab = time.time()
         st.write("β Table Extraction in progress ...")
         # Table Extraction
         # L = []
@@ -569,7 +572,7 @@ if uploaded_files:
            else:
                L.append(row)

-        st.write("β Table Extraction Done ...")
+        st.write("β Table Extraction Done ...", round((time.time() - start_time_summ) / 60, 2), "minutes")
         status.update(label="Gene and SNPs succesfully collected.")
         L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
         L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -624,6 +627,7 @@ if uploaded_files:
         with NamedTemporaryFile(dir='.', suffix=".pdf") as rotated_pdf:
             pdf_writer.write(rotated_pdf.name)
             # Entity Extraction
+            start_time_ext = time.time()
             st.write("β Extracting Entities ...")
             bytes_data = uploaded_file.read()
             journal = Journal(uploaded_file.name, bytes_data)
@@ -654,12 +658,14 @@ if uploaded_files:
             chunkdf.append(df)

             concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-            st.write("β Entities Extraction Done ..")
+            st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
             time.sleep(0.1)
+            start_time_summ = time.time()
             st.write("β Generating Summary ...")
             summary = get_summ(pdf.name)
-            st.write("β Generating Summary Done ..")
+            st.write("β Generating Summary Done ..", round((time.time() - start_time_summ) / 60, 2), "minutes")
             time.sleep(0.1)
+            start_time_tab = time.time()
             st.write("β Table Extraction in progress ...")

             # Table Extraction
@@ -930,53 +936,12 @@ if uploaded_files:
                                })
                            else:
                                L.append(row)
-
-
-
-
-
-
-                               'Title' : concat['title'][0],
-                               'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
-                               'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
-                               'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
-                               'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
-                               'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
-                               'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
-                               'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
-                               'Recommendation' : summary,
-                               }
-                           }
-                           if not row['SNPs'].startswith("rs"):
-                               row.update({
-                                   'SNPs' : "-"
-                               })
-                           else:
-                               L.append(row)
-                       except KeyError:
-                           row = literal_eval(output_list[2]['result'].split('\n')[i])
-                           row = {**row, **{
-                               'Title' : concat['title'][0],
-                               'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
-                               'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
-                               'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
-                               'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
-                               'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
-                               'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
-                               'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
-                               'Recommendation' : summary,
-                               }
-                           }
-                           if not row['SNPs'].startswith("rs"):
-                               row.update({
-                                   'SNPs' : "-"
-                               })
-                           else:
-                               L.append(row)
-                       except ValueError:
-                           if type(output_list[2]['result'].split('\n')[i]) is dict:
-                               row = output_list[2]['result'].split('\n')[i]
-                               row = {**row, **{
+               # 3
+               for i in range(len(output_list[2]['result'].split('\n'))):
+                   if output_list[2]['result'].split('\n')[i] != "":
+                       try:
+                           row = literal_eval(output_list[2]['result'].split('\n')[i])[0]
+                           row = {**row, **{
                                'Title' : concat['title'][0],
                                'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
                                'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
@@ -987,15 +952,15 @@ if uploaded_files:
                                'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
                                'Recommendation' : summary,
                                }
-
-
-
-
-
-
-
-
-                           row = literal_eval(
+                           }
+                           if not row['SNPs'].startswith("rs"):
+                               row.update({
+                                   'SNPs' : "-"
+                               })
+                           else:
+                               L.append(row)
+                       except KeyError:
+                           row = literal_eval(output_list[2]['result'].split('\n')[i])
                            row = {**row, **{
                                'Title' : concat['title'][0],
                                'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
@@ -1014,7 +979,49 @@ if uploaded_files:
                               })
                           else:
                               L.append(row)
-
+                       except ValueError:
+                           if type(output_list[2]['result'].split('\n')[i]) is dict:
+                               row = output_list[2]['result'].split('\n')[i]
+                               row = {**row, **{
+                                   'Title' : concat['title'][0],
+                                   'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+                                   'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+                                   'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+                                   'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+                                   'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+                                   'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+                                   'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+                                   'Recommendation' : summary,
+                                   }
+                               }
+                               if not row['SNPs'].startswith("rs"):
+                                   row.update({
+                                       'SNPs' : "-"
+                                   })
+                               else:
+                                   L.append(row)
+                       except SyntaxError:
+                           row = literal_eval("""{}""".format(output_list[2]['result'].split('\n')[i]))
+                           row = {**row, **{
+                               'Title' : concat['title'][0],
+                               'Authors' : concat['authors'][0] if 'authors' in list(concat.columns) else '',
+                               'Publisher Name' : concat['publisher_name'][0] if 'publisher_name' in list(concat.columns) else '',
+                               'Publication Year' : get_valid_year(' '.join(concat['year_of_publication'].values.tolist())) if 'year_of_publication' in concat.columns else concat.assign(year_of_publication='')['year_of_publication'],
+                               'Population' : upper_abbreviation(' '.join(concat['population_race'].values.tolist()).replace('Unknown', '').title()) if 'population_race' in concat.columns else concat.assign(population_race='')['population_race'],
+                               'Sample Size' : sample_size_postproc(' '.join(concat['sample_size'].values.tolist()).replace('Unknown', '').title()) if 'sample_size' in concat.columns else concat.assign(sample_size='')['sample_size'],
+                               'Study Methodology' : upper_abbreviation(' '.join(concat['study_methodology'].values.tolist()).replace('Unknown', '').title()) if 'study_methodology' in concat.columns else concat.assign(study_methodology='')['study_methodology'],
+                               'Study Level' : upper_abbreviation(' '.join(concat['study_level'].values.tolist()).replace('Unknown', '').title()) if 'study_level' in concat.columns else concat.assign(study_level='')['study_level'],
+                               'Recommendation' : summary,
+                               }
+                           }
+                           if not row['SNPs'].startswith("rs"):
+                               row.update({
+                                   'SNPs' : "-"
+                               })
+                           else:
+                               L.append(row)
+
+               st.write("β Table Extraction Done", round((time.time() - start_time_summ) / 60, 2), "minutes")
                status.update(label="Gene and SNPs succesfully collected.")
                L = [{key: ''.join(['' if item == 'Unknow' else item for item in value]) for key, value in d.items()} for d in L]
                L = [{key: ''.join(['Not Available' if item == '' else item for item in value]) for key, value in d.items()} for d in L]
@@ -1048,6 +1055,7 @@ if uploaded_files:
         pdf.write(uploaded_file.getbuffer())

         # Entity Extraction
+        start_time_ext = time.time()
         st.write("β Extracting Entities ...")
         bytes_data = uploaded_file.read()
         journal = Journal(uploaded_file.name, bytes_data)
@@ -1068,8 +1076,9 @@ if uploaded_files:
         chunkdf.append(df)

         concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
-        st.write("β Entities Extraction Done ..")
+        st.write("β Entities Extraction Done ..", round((time.time() - start_time_ext) / 60, 2), "minutes")
         time.sleep(0.1)
+        start_time_summ = time.time()
         st.write("β Generating Summary ...")

         if 'SNPs' in list(concat.columns):
@@ -1081,7 +1090,7 @@ if uploaded_files:

         summary = get_summ(pdf.name)
         time.sleep(0.1)
-        st.write("β Generating Summary Done...")
+        st.write("β Generating Summary Done...", round((time.time() - start_time_summ) / 60, 2), "minutes")
         for i in range(len(concat)):
             if (len(concat['genes_locus'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
                 for g in concat['genes_locus'][i].split(','):
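The commit applies one pattern throughout: capture time.time() into a start_time_* variable before a stage, then append the elapsed minutes to that stage's completion message. As committed, the "Table Extraction Done" messages measure from start_time_summ rather than the newly added start_time_tab, so start_time_tab is assigned but never read. A small context manager could express the checkpoint once per stage; the following is a hypothetical sketch (timed_stage is not a name in app.py), not the committed code:

import time
from contextlib import contextmanager

import streamlit as st

@contextmanager
def timed_stage(label):
    # Announce the stage, run its body, then report the elapsed minutes,
    # mirroring the st.write checkpoints this commit adds by hand.
    st.write(f"{label} ...")
    start = time.time()
    try:
        yield
    finally:
        st.write(f"{label} Done ..", round((time.time() - start) / 60, 2), "minutes")

Each branch of the upload handler could then wrap its stages uniformly, e.g. "with timed_stage('Generating Summary'): summary = get_summ(pdf.name)", which would also rule out a start/stop mismatch like the one above.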