Spaces:
Running
Running
| import streamlit as st | |
| import time | |
| import json | |
| from gensim.models import Word2Vec | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import squarify | |
| import numpy as np | |
# Inject page-wide CSS so both the document body and the Streamlit app
# container use the same light-blue background as the charts further down.
# unsafe_allow_html=True is needed so the <style> element is emitted as HTML.
st.markdown("""
<style>
body {
    background-color: #EBF5FB;
    /* color: #ffffff; */
}
.stApp {
    background-color: #EBF5FB;
    /* color: #ffffff; */
}
</style>
""", unsafe_allow_html=True)
st.header("Word2Vec App for Clotting Pubmed Database.")
text_input_value = st.text_input("Enter one term to search within the Clotting database", max_chars=50)
# Normalise the search term before it is used for lookups:
#   * strip surrounding whitespace, so an accidental leading/trailing space
#     neither fails the alphabetic check nor misses the vocabulary, and a
#     whitespace-only entry is treated as "no query";
#   * lower-case it (presumably the model vocabulary is lower-cased - confirm).
query = text_input_value.strip().lower()
def _render_treemap(labels, ranks, colormap, fontsize):
    """Draw a rank-weighted treemap and render it in the Streamlit page.

    Args:
        labels: Text shown in each tile, in display order.
        ranks: Zero-based similarity rank of each label. A tile's weight is
            ``1 / (rank + 1)``, so better-ranked terms get larger tiles.
        colormap: Matplotlib colormap (e.g. ``plt.cm.Greens``); it is sampled
            evenly between 0.05 and 0.5, one colour per tile.
        fontsize: Font size used for the tile labels.
    """
    # Offset by one so that rank 0 yields weight 1 instead of a division by
    # zero (which would produce an infinite tile size).
    sizes = [1 / (rank + 1) for rank in ranks]
    colors = list(colormap(np.linspace(0.05, .5, len(sizes))))

    # Use a dedicated figure per chart rather than the global pyplot figure,
    # so nothing drawn for a previous chart (or a previous rerun of the
    # script) can leak into this one.
    fig, ax = plt.subplots()
    squarify.plot(sizes=sizes, label=list(labels), color=colors, edgecolor="#EBF5FB",
                  text_kwargs={'fontsize': fontsize}, ax=ax)
    ax.axis('off')
    fig.patch.set_facecolor('#EBF5FB')
    st.pyplot(fig)
    # Release the figure once Streamlit has rendered it.
    plt.close(fig)


if query:
    if query.isalpha():
        # Short progress animation shown before the search; the sleeps only
        # pace the bar, no work happens inside this loop.
        bar = st.progress(0)
        time.sleep(.2)
        st.caption(":LightSkyBlue[searching 40123 PubMed abstracts]")
        for percent in range(10, 101, 10):
            bar.progress(percent)
            time.sleep(.1)
    else:
        # NOTE(review): the search below still runs after this warning -
        # confirm whether non-alphabetic terms should stop here instead.
        st.write('Please omit numbers in term')

    # Loading is deliberately outside the vocabulary check: a missing or
    # unreadable model file is a deployment problem and must not be reported
    # to the user as a rare search term.
    model = Word2Vec.load("pubmed_model_clotting")
    if query not in model.wv.key_to_index:
        st.error("Term occurrence is too low - please try another term")
        st.stop()

    # Ranked neighbours of the query; the default 0-based index is the rank.
    neighbours = model.wv.most_similar_cosmul(query, topn=10000)
    table = pd.DataFrame(neighbours, columns=['Word', 'SIMILARITY'])
    table.index.name = 'Rank'

    # Console output for whoever is watching the server log.
    print()
    print("Similarity to " + str(query))
    pd.set_option('display.max_rows', None)
    print(table.head(50))

    st.subheader(f"Top 10 Words closely related to {query}")
    top_words = table.head(10)
    _render_treemap(top_words['Word'], top_words.index, plt.cm.Greens, fontsize=10)
    words_csv = table.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 words (csv)", data=words_csv, file_name='clotting_words.csv', mime='text/csv')

    print()
    print("Human genes similar to " + str(query))
    # Keep only neighbours that appear in the gene-symbol list. The original
    # rank index is preserved, so gene tiles are still weighted by their rank
    # among all neighbours.
    gene_symbols = pd.read_csv('Human_Genes.csv')
    is_gene = table.Word.isin(gene_symbols.symbol)
    # rename() returns a new frame, so the column assignment below does not
    # write into a slice of `table`.
    genes = table[is_gene].rename(columns={'Word': 'Human Gene'})
    genes["Human Gene"] = genes["Human Gene"].str.upper()
    print(genes.head(50))
    print()

    st.subheader(f"Top 10 Genes closely related to {query}")
    top_genes = genes.head(10)
    _render_treemap(top_genes['Human Gene'], top_genes.index, plt.cm.Blues, fontsize=12)
    genes_csv = genes.head(100).to_csv().encode('utf-8')
    st.download_button(label="download top 100 genes (csv)", data=genes_csv, file_name='clotting_genes.csv', mime='text/csv')