Spaces:
Running
Running
| # import requests | |
| # import numpy as np | |
| # import pandas as pd | |
| # from utils import * | |
| # from pathlib import Path | |
| # from bs4 import BeautifulSoup | |
| # from add_sasa import * | |
| # def addModbaseModels(dataframe, path_to_input_files, path_to_output_files): | |
| # if len(dataframe) != 0: | |
| # # GET MODBASE MODELS | |
| # # Get IDs from data to retrieve only their models from MODBASE | |
| # dataframe.reset_index(inplace=True, drop=True) | |
| # existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*")) | |
| # existing_modbase_models = [str(i) for i in existing_modbase_models] | |
| # existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models] | |
| # existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*")) | |
| # existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind] | |
| # existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind] | |
| # modbase_reduced = pd.DataFrame(columns = ['uniprotID', 'target_begin', 'target_end', 'quality_score', | |
| # 'model_id', 'coordinates','AAonPDB', 'coordVAR']) | |
| # print('Retrieving ModBase models...\n') | |
| # modbase = pd.DataFrame( | |
| # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id', | |
| # 'coordinates', 'AAonPDB', 'coordVAR']) | |
| # no_modbase = pd.DataFrame( | |
| # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id', | |
| # 'coordinates', 'AAonPDB', 'coordVAR']) | |
| # # Get model files associated with each UniProtID | |
| # existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*")) | |
| # existing_free_sasa = [str(i) for i in existing_free_sasa] | |
| # existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa] | |
| # keep_cols = dataframe.columns | |
| # for i in dataframe.index: | |
| # coordDict = {} | |
| # protein = dataframe.at[i, 'uniprotID'] | |
| # varPos = int(dataframe.at[i, 'pos']) | |
| # wt = dataframe.at[i, 'wt'] | |
| # mut = dataframe.at[i, 'mut'] | |
| # datapoint = dataframe.at[i, 'datapoint'] | |
| # if protein not in existing_modbase_models: | |
| # print('Downloading Modbase models for ', protein) | |
| # url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein | |
| # req = requests.get(url) | |
| # name = path_to_output_files / 'modbase_structures' / f'{protein}.txt' | |
| # with open(name, 'wb') as f: | |
| # f.write(req.content) | |
| # else: | |
| # print('Model exists for', protein) | |
| # name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt') | |
| # with open(name, encoding="utf8") as f: | |
| # a = open(name, 'r').read() | |
| # soup = BeautifulSoup(a, 'lxml') | |
| # if soup.findAll('pdbfile') != []: | |
| # for pdb in soup.findAll('pdbfile'): | |
| # model_id = str(pdb.contents[1])[10:-11] | |
| # if model_id not in existing_modbase_models_ind: | |
| # with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', encoding="utf8") as individual: | |
| # individual.write(str('UniProt ID: ' + protein)) | |
| # individual.write('\n') | |
| # individual.write(str(pdb.contents[3])[10:-11].strip()) | |
| # run_freesasa( | |
| # Path(path_to_output_files / 'modbase_structures_individual' / f'{model_id.lower()}.txt'), | |
| # Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt'), | |
| # include_hetatms=True, | |
| # outdir=None, force_rerun=False, file_type='pdb') | |
| # filename = Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt') | |
| # st.write('filename', filename) | |
| # st.write('varPos', varPos) | |
| # st.write('wt', wt) | |
| # st.write('protein', protein) | |
| # st.write('path_to_output_files', path_to_output_files) | |
| # dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb') | |
| # st.write('sasa', dataframe.loc[i, 'sasa'] ) | |
| # st.write('model_id', model_id) | |
| # with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m: | |
| # lines = m.readlines() | |
| # quality_score = -999 | |
| # for ind_line in lines: | |
| # if ind_line[0:10] == 'UniProt ID': | |
| # uniprot_id = ind_line.split(':')[1].strip() | |
| # if ind_line[0:23] == 'REMARK 220 TARGET BEGIN': | |
| # target_begin = ind_line[40:43].strip() | |
| # if ind_line[0:21] == 'REMARK 220 TARGET END': | |
| # target_end = ind_line[40:43].strip() | |
| # coordDict, AAonPDB, coordVAR = {},np.NaN,np.NaN | |
| # if (int(varPos) > int(target_begin)) & (int(varPos) < int(target_end)): | |
| # coordDict = {} | |
| # for ind_line in lines: | |
| # if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID': | |
| # model_id = ind_line[40:].strip() | |
| # if ind_line[0:15].strip() == 'REMARK 220 MPQS': | |
| # quality_score = ind_line[40:].strip() | |
| # if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA': | |
| # position = int(ind_line[22:26].strip()) | |
| # chain = ind_line[20:22].strip() | |
| # aminoacid = threeToOne(ind_line[17:20]) | |
| # coords = [ind_line[31:38].strip(), ind_line[39:46].strip(), ind_line[47:54].strip()] | |
| # coordDict[position] = coords | |
| # if position == int(varPos): | |
| # AAonPDB = aminoacid | |
| # coordVAR = str(coords) | |
| # if ind_line[0:3] == 'TER': | |
| # break | |
| # try: | |
| # k = pd.Series( | |
| # [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR]) | |
| # new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin, | |
| # 'target_end': target_end, 'quality_score': quality_score, | |
| # 'model_id': model_id, 'coordinates': coordDict, | |
| # 'AAonPDB': AAonPDB, 'coordVAR': coordVAR} | |
| # modbase_reduced = modbase_reduced.append(new_row, ignore_index=True) | |
| # modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']] | |
| # modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left') | |
| # modbase.quality_score = modbase.quality_score.astype(float) | |
| # modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False) | |
| # modbase.reset_index(inplace=True, drop=True) | |
| # modbase.fillna(np.NaN, inplace=True) | |
| # modbase.replace({'\'?\', ': '', | |
| # ', \'?\'': '', | |
| # '(': '', ')': '', | |
| # '[\'?\']': np.NaN, | |
| # '[]': np.NaN, | |
| # 'nan-nan': np.NaN, | |
| # '': np.NaN}, inplace=True) | |
| # except NameError: | |
| # print('This file doesnt have Quality Score. Replacer: -999', model_id) | |
| # else: | |
| # new_row = {'uniprotID': uniprot_id, 'wt': wt, | |
| # 'pos': varPos, 'mut': mut, 'datapoint': datapoint } | |
| # no_modbase = no_modbase.append(new_row, ignore_index=True) | |
| # else: | |
| # new_row = {'uniprotID': uniprot_id, 'wt': wt, | |
| # 'pos': varPos, 'mut': mut, 'datapoint': datapoint } | |
| # no_modbase = no_modbase.append(new_row, ignore_index=True) | |
| # no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])] | |
| # no_modbase = pd.concat([no_modbase, no_modbase_no_Coord]) | |
| # modbase = modbase[~pd.isna(modbase['coordVAR'])] | |
| # no_modbase = no_modbase[keep_cols] | |
| # return modbase, no_modbase | |
| import requests | |
| import numpy as np | |
| import pandas as pd | |
| from utils import * | |
| from pathlib import Path | |
| from bs4 import BeautifulSoup | |
| from add_sasa import * | |
def _cached_stems(directory):
    """Return the names of the entries in *directory*, each cut at its first dot."""
    return {entry.name.split('.')[0] for entry in Path(directory).glob('*')}


def _leading_int(text):
    """Return the first whitespace-separated token of *text* as an int, or None."""
    tokens = text.split()
    try:
        return int(tokens[0])
    except (IndexError, ValueError):
        return None


def _parse_modbase_model(lines, var_pos, protein, model_id):
    """Extract the model record for one variant from an extracted ModBase model.

    Args:
        lines: Lines of the individual model file (a 'UniProt ID:' header line
            followed by the PDB-formatted model text).
        var_pos: Variant position as an int.
        protein: Accession to fall back on when the file has no 'UniProt ID' line.
        model_id: Model id to fall back on when the file has no
            'REMARK 220 MODPIPE MODEL ID' line.

    Returns:
        A ``(uniprot_id, row)`` tuple. ``row`` is None when the target range
        is missing or does not strictly contain ``var_pos``; otherwise it is
        a dict with the keys 'uniprotID', 'quality_score', 'model_id',
        'coordinates', 'AAonPDB' and 'coordVAR'.
    """
    uniprot_id = protein
    target_begin = target_end = None
    for line in lines:
        if line[0:10] == 'UniProt ID':
            uniprot_id = line.split(':')[1].strip()
        # Take the whole first token after column 40 so that residue numbers
        # of four or more digits are not cut short.
        # NOTE(review): assumes the value is left-aligned at column 40, as the
        # MODEL ID / MPQS parsing below also does -- confirm on a real file.
        if line[0:23] == 'REMARK 220 TARGET BEGIN':
            target_begin = _leading_int(line[40:])
        if line[0:21] == 'REMARK 220 TARGET END':
            target_end = _leading_int(line[40:])

    # Strict inequalities: a variant on the first or last modelled residue is
    # treated as not covered.
    if target_begin is None or target_end is None:
        return uniprot_id, None
    if not (target_begin < var_pos < target_end):
        return uniprot_id, None

    quality_score = -999  # placeholder kept when the file has no MPQS remark
    coordinates = {}
    aa_on_pdb, coord_var = np.nan, np.nan
    for line in lines:
        if line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
            model_id = line[40:].strip()
        if line[0:15].strip() == 'REMARK 220 MPQS':
            quality_score = line[40:].strip()
        if line[0:4] == 'ATOM' and line[13:15] == 'CA':
            position = int(line[22:26].strip())
            # PDB ATOM records keep x, y, z in columns 31-38, 39-46 and 47-54
            # (1-based), i.e. these 0-based slices.
            xyz = [line[30:38].strip(), line[38:46].strip(), line[46:54].strip()]
            coordinates[position] = xyz
            if position == var_pos:
                aa_on_pdb = threeToOne(line[17:20])
                coord_var = str(xyz)
        if line[0:3] == 'TER':
            # Only the records before the first TER are read.
            break

    row = {'uniprotID': uniprot_id, 'quality_score': quality_score,
           'model_id': model_id, 'coordinates': coordinates,
           'AAonPDB': aa_on_pdb, 'coordVAR': coord_var}
    return uniprot_id, row


def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
    """Attach ModBase model information to every variant in *dataframe*.

    For each row the ModBase download for the row's 'uniprotID' is fetched
    (or reused from ``modbase_structures``), every ``<pdbfile>`` entry is
    written to ``modbase_structures_individual``, ``run_freesasa`` and
    ``sasa`` are called for it, and the model is parsed for the C-alpha
    coordinates around the variant position.

    Args:
        dataframe: Frame with the columns 'uniprotID', 'pos', 'wt', 'mut' and
            'datapoint'. Its index is reset in place.
        path_to_input_files: Unused by this function.
        path_to_output_files: ``pathlib.Path`` of the directory that holds
            ``modbase_structures``, ``modbase_structures_individual`` and
            ``freesasa_files``.

    Returns:
        ``(modbase, no_modbase)``: ``modbase`` is *dataframe* left-merged on
        'uniprotID' with the parsed model rows, restricted to rows that have
        a 'coordVAR' value; ``no_modbase`` holds the remaining datapoints,
        restricted to the columns of *dataframe*. Returns None when
        *dataframe* is empty.
    """
    if len(dataframe) == 0:
        return None

    dataframe.reset_index(inplace=True, drop=True)
    structures_dir = path_to_output_files / 'modbase_structures'
    individual_dir = path_to_output_files / 'modbase_structures_individual'
    sasa_dir = path_to_output_files / 'freesasa_files'
    existing_modbase_models = _cached_stems(structures_dir)
    existing_modbase_models_ind = _cached_stems(individual_dir)
    keep_cols = dataframe.columns

    print('Retrieving ModBase models...\n')
    # Rows are collected in plain lists and turned into frames once at the
    # end; DataFrame.append no longer exists in pandas 2.
    model_rows = []
    no_model_rows = []

    # NOTE(review): `st` is not imported explicitly in this module; it is
    # presumably provided by one of the star imports -- confirm.
    for i in dataframe.index:
        protein = dataframe.at[i, 'uniprotID']
        var_pos = int(dataframe.at[i, 'pos'])
        wt = dataframe.at[i, 'wt']
        mut = dataframe.at[i, 'mut']
        datapoint = dataframe.at[i, 'datapoint']

        name = structures_dir / f'{protein}.txt'
        if protein not in existing_modbase_models:
            print('Downloading Modbase models for ', protein)
            url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
            req = requests.get(url)
            with open(name, 'wb') as f:
                f.write(req.content)
            # Remember the download so further variants of the same protein
            # reuse it within this run.
            existing_modbase_models.add(protein)
        else:
            print('Model exists for', protein)

        with open(name, encoding="utf8") as f:
            soup = BeautifulSoup(f.read(), 'lxml')

        pdb_entries = soup.find_all('pdbfile')
        if not pdb_entries:
            # No model in the download: the accession can only come from the
            # input row, nothing has been parsed yet.
            no_model_rows.append({'uniprotID': protein, 'wt': wt,
                                  'pos': var_pos, 'mut': mut, 'datapoint': datapoint})
            continue

        for pdb in pdb_entries:
            model_id = str(pdb.contents[1])[10:-11]
            model_file = individual_dir / f'{model_id}.txt'
            if model_id not in existing_modbase_models_ind:
                with open(model_file, 'w', encoding="utf8") as individual:
                    individual.write(str('UniProt ID: ' + protein))
                    individual.write('\n')
                    individual.write(str(pdb.contents[3])[10:-11].strip())
                existing_modbase_models_ind.add(model_id)

            # NOTE(review): called for every model and left to
            # force_rerun=False to skip existing output -- confirm that
            # run_freesasa behaves that way.
            run_freesasa(
                Path(individual_dir / f'{model_id.lower()}.txt'),
                Path(sasa_dir / f'{model_id.lower()}.txt'),
                include_hetatms=True,
                outdir=None, force_rerun=False, file_type='pdb')
            filename = Path(sasa_dir / f'{model_id.lower()}.txt')
            st.write('filename', filename)
            st.write('varPos', var_pos)
            st.write('wt', wt)
            st.write('protein', protein)
            st.write('path_to_output_files', path_to_output_files)
            sasa_val = sasa(protein, var_pos, wt, 1, filename, path_to_output_files, file_type='pdb')
            st.write('sasa', sasa_val)
            st.write('model_id', model_id)

            with open(model_file, encoding="utf8") as m:
                lines = m.readlines()
            uniprot_id, row = _parse_modbase_model(lines, var_pos, protein, model_id)
            if row is None:
                no_model_rows.append({'uniprotID': uniprot_id, 'wt': wt,
                                      'pos': var_pos, 'mut': mut, 'datapoint': datapoint})
            else:
                row['sasa'] = sasa_val
                model_rows.append(row)

    modbase_reduced = pd.DataFrame(
        model_rows,
        columns=['uniprotID', 'quality_score', 'model_id', 'coordinates',
                 'AAonPDB', 'coordVAR', 'sasa'])
    # NOTE(review): the join key is 'uniprotID' only, so every datapoint of a
    # protein is paired with every model row recorded for that protein.
    modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
    modbase['quality_score'] = modbase['quality_score'].astype(float)
    modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
    modbase.reset_index(inplace=True, drop=True)
    modbase.fillna(np.nan, inplace=True)
    modbase.replace({'\'?\', ': '',
                     ', \'?\'': '',
                     '(': '', ')': '',
                     '[\'?\']': np.nan,
                     '[]': np.nan,
                     'nan-nan': np.nan,
                     '': np.nan}, inplace=True)

    no_modbase = pd.DataFrame(
        no_model_rows, columns=['uniprotID', 'wt', 'pos', 'mut', 'datapoint'])
    no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
    no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
    modbase = modbase[~pd.isna(modbase['coordVAR'])]
    # Several columns are selected with a list; a bare tuple is looked up as
    # a single column label.
    st.write(modbase[['datapoint', 'model_id', 'sasa']])
    no_modbase = no_modbase[keep_cols]
    return modbase, no_modbase