# PROTAC-DB Article Scraping

This notebook is used to scrape the article data from the PROTAC-DB website. The data is then saved in a CSV file for further analysis.

This code has been developed to double check that the reported information in the PROTAC-DB website is the same as the one reported in their CSV downloadeable file.

In [1]:
# from IPython.display import display_html

import logging
import warnings
import re
import os
import numpy as np
import pandas as pd
import pickle
import pickle
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem
from typing import Literal, Union, List, Dict, Any, Callable
from collections import defaultdict
from tqdm.auto import tqdm
from rdkit import RDLogger

RDLogger.DisableLog('rdApp.*')

Filter out some warnings...

In [2]:
def set_global_logging_level(level=logging.ERROR, prefices=[""]):
    """
    Override logging levels of different modules based on their name as a prefix.
    It needs to be invoked after the modules have been loaded so that their loggers have been initialized.

    Args:
        - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR
        - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional.
          Default is `[""]` to match all active loggers.
          The match is a case-sensitive `module_name.startswith(prefix)`
    """
    prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })')
    for name in logging.root.manager.loggerDict:
        if re.match(prefix_re, name):
            logging.getLogger(name).setLevel(level)


# Filter out annoying Pytorch Lightning printouts
warnings.filterwarnings('ignore')
warnings.filterwarnings(
    'ignore', '.*Covariance of the parameters could not be estimated.*')
warnings.filterwarnings(
    'ignore', '.*You seem to be using the pipelines sequentially on GPU.*')

Setup working directories:

In [3]:
data_dir = os.path.join(os.getcwd(), '..', 'data')
src_dir = os.path.join(os.getcwd(), '..', 'src')
fig_dir = os.path.join(data_dir, 'figures')
checkpoint_dir = os.path.join(os.getcwd(), '..', 'checkpoints')
dirs_to_make = [
    data_dir,
    # os.path.join(data_dir, 'raw'),
    # os.path.join(data_dir, 'processed'),
    fig_dir,
    # os.path.join(data_dir, 'train'),
    # os.path.join(data_dir, 'val'),
    # os.path.join(data_dir, 'test'),
    # src_dir,
    # checkpoint_dir,
]
for d in dirs_to_make:
    if not os.path.exists(d):
        os.makedirs(d)

Download or load the raw PROTAC-DB dataset:

In [4]:
protacdb_file = os.path.join(data_dir, 'PROTAC-DB.csv')
protacdb_url = 'http://cadd.zju.edu.cn/protacdb/statics/binaryDownload/csv/protac/protac.csv'
if os.path.exists(protacdb_file):
    protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)
    print(f'Loaded protac.csv')
else:
    print(f'Downloading {protacdb_url}')
    !wget {protacdb_url} {protacdb_file}
    protac_df = pd.read_csv(protacdb_file).reset_index(drop=True)
    print(f'PROTAC-DB loaded')

old2new = {
    'E3 ligase': 'E3 Ligase',
}
protac_df = protac_df.rename(columns=old2new)

Loaded protac.csv


In [7]:
scraped_protac_df = pd.read_csv(os.path.join(
    data_dir, 'PROTAC-DB-Scraped.csv'))
# Rename columns
old2new = {
    "Assay (Percent degradation)": "Percent degradation (%)",
    "PROTAC to Target": "Assay (Percent degradation)",
    "DOI": "Article DOI",
    "DC50": "DC50 (nM)",
    "Dmax": "Dmax (%)",
}
scraped_protac_df = scraped_protac_df.rename(columns=old2new)

display(scraped_protac_df.head())
print(f'PROTAC-DB scraped len: {len(scraped_protac_df)}')
print(f'Number of non-NaN DC50 values: {scraped_protac_df["DC50 (nM)"].notna().sum()}')
print(f'Number of non-NaN Dmax values: {scraped_protac_df["Dmax (%)"].notna().sum()}')
print(f'Number of non-NaN DC50 and Dmax values: {len(scraped_protac_df[scraped_protac_df["DC50 (nM)"].notna() & scraped_protac_df["Dmax (%)"].notna()])}')
scraped_protac_df[scraped_protac_df["DC50 (nM)"].notna() & scraped_protac_df["Dmax (%)"].notna()]

Unnamed: 0,Compound ID,Target,Percent degradation (%),Article DOI,DC50 (nM),Dmax (%),Assay (DC50/Dmax),Assay (Percent degradation)
0,1,BRD7,20/12 (WB),10.1021/acs.jmedchem.8b01413,,,,% BRD7 degradation in HeLa cells after 4/16 h ...
1,1,BRD9,19/30 (WB),10.1021/acs.jmedchem.8b01413,,,,% BRD9 degradation in HeLa cells after 4/16 h ...
2,2,BRD7,19/27 (WB),10.1021/acs.jmedchem.8b01413,,,,% BRD7 degradation in HeLa cells after 4/16 h ...
3,2,BRD9,5/21 (WB),10.1021/acs.jmedchem.8b01413,,,,% BRD9 degradation in HeLa cells after 4/16 h ...
4,3,BRD9,94/93 (WB),10.1021/acs.jmedchem.8b01413,,,,% BRD9 degradation in HeLa cells after 4/16 h ...


PROTAC-DB scraped len: 6000
Number of non-NaN DC50 values: 959
Number of non-NaN Dmax values: 783
Number of non-NaN DC50 and Dmax values: 374


Unnamed: 0,Compound ID,Target,Percent degradation (%),Article DOI,DC50 (nM),Dmax (%),Assay (DC50/Dmax),Assay (Percent degradation)
19,11,BRD9,,10.1021/acs.jmedchem.8b01413,560,80,Degradation of BRD9 in HeLa cells after 4 h tr...,
40,22,BRD9,,10.1021/acs.jmedchem.8b01413,1.76,95,Degradation of BRD9 in RI-1 cells after 8 h tr...,
42,22,BRD7,,10.1021/acs.jmedchem.8b01413,4.5,95,Degradation of BRD7 in RI-1 cells after 8 h tr...,
104,118,AR,76/>95/>99,10.1021/acs.jmedchem.9b01393,7.2/1,>95/>95,Degradation of AR in LNCaP/VCaP AR+ cells afte...,% AR degradation in LNCaP cells after 6 h trea...
106,120,AR,89/99/100,10.1021/acs.jmedchem.8b01631,0.86/0.76/10.4,>95/>95/>95,Degradation of AR in LNCaP/VCaP/22Rv1 cells af...,% AR degradation in LNCaP cells after 6 h trea...
...,...,...,...,...,...,...,...,...
5901,3218,BCL-xL,,10.1038/s41467-021-27210-x,12.1,81,Degradation of BCL-xL in 293T cells after 16 h...,
5902,3218,BCL2,,10.1038/s41467-021-27210-x,354.8,69,Degradation of BCL2 in 293T cells after 16 h t...,
5950,3217,BCL-xL,,10.1038/s41467-021-27210-x,12.8,96,Degradation of BCL-xL in 293T cells after 16 h...,
5951,3218,BCL-xL,,10.1038/s41467-021-27210-x,12.1,81,Degradation of BCL-xL in 293T cells after 16 h...,


In [8]:
# Merge scraped data with PROTAC-DB on Compound ID and get non-assay columns
param_cols = [
    "Percent degradation (%)",
    "Dmax (%)",
    "DC50 (nM)",
    "Assay (Percent degradation)",
    "Assay (DC50/Dmax)",
]
cols = [c for c in protac_df.columns if c not in param_cols]
on_cols = [c for c in scraped_protac_df.columns if c not in param_cols]
scraped_protac_df = protac_df[cols].merge(
    scraped_protac_df,
    on=on_cols,
).drop_duplicates()
print(f'Merged df len: {len(scraped_protac_df)}')
scraped_protac_df.head()

Merged df len: 5343


Unnamed: 0,Compound ID,Uniprot,Target,E3 Ligase,PDB,Name,Smiles,"IC50 (nM, Protac to Target)","Assay (Protac to Target, IC50)","EC50 (nM, Protac to Target)",...,Rotatable Bond Count,Topological Polar Surface Area,Molecular Formula,InChI,InChI Key,Percent degradation (%),DC50 (nM),Dmax (%),Assay (DC50/Dmax),Assay (Percent degradation)
0,1,Q9NPI1,BRD7,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,...,19,189.92,C50H64N8O9S,InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1...,RPMQBLMPGMFXLD-PDUNVWSESA-N,20/12 (WB),,,,% BRD7 degradation in HeLa cells after 4/16 h ...
1,1,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,...,19,189.92,C50H64N8O9S,InChI=1S/C50H64N8O9S/c1-32-45(68-31-53-32)34-1...,RPMQBLMPGMFXLD-PDUNVWSESA-N,19/30 (WB),,,,% BRD9 degradation in HeLa cells after 4/16 h ...
2,2,Q9NPI1,BRD7,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,...,25,208.38,C54H72N8O11S,InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-...,NGWWVKZONFCNQP-SHPBXJAASA-N,19/27 (WB),,,,% BRD7 degradation in HeLa cells after 4/16 h ...
3,2,Q9H8M2,BRD9,VHL,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,...,25,208.38,C54H72N8O11S,InChI=1S/C54H72N8O11S/c1-36-49(74-35-57-36)38-...,NGWWVKZONFCNQP-SHPBXJAASA-N,5/21 (WB),,,,% BRD9 degradation in HeLa cells after 4/16 h ...
4,3,Q9H8M2,BRD9,CRBN,,,COC1=CC(C2=CN(C)C(=O)C3=CN=CC=C23)=CC(OC)=C1CN...,,,,...,18,202.97,C43H50N8O10,InChI=1S/C43H50N8O10/c1-48-24-31(28-9-10-44-23...,RMBNUDOJPQLHMV-UHFFFAOYSA-N,94/93 (WB),,,,% BRD9 degradation in HeLa cells after 4/16 h ...


In [9]:
print(len(scraped_protac_df.dropna(
    subset=['DC50 (nM)', 'Dmax (%)']).dropna(how='all')))
print(len(scraped_protac_df.dropna(
    subset=['Percent degradation (%)']).dropna(how='all')))

tmp = scraped_protac_df.dropna(subset=['Percent degradation (%)'])

tmp[tmp['Percent degradation (%)'].str.contains(
    'WB')][['Percent degradation (%)', 'Assay (Percent degradation)']].drop_duplicates()

362
737


Unnamed: 0,Percent degradation (%),Assay (Percent degradation)
0,20/12 (WB),% BRD7 degradation in HeLa cells after 4/16 h ...
1,19/30 (WB),% BRD9 degradation in HeLa cells after 4/16 h ...
2,19/27 (WB),% BRD7 degradation in HeLa cells after 4/16 h ...
3,5/21 (WB),% BRD9 degradation in HeLa cells after 4/16 h ...
4,94/93 (WB),% BRD9 degradation in HeLa cells after 4/16 h ...
...,...,...
4861,54 (WB),% JAK3 degradation in MHH-CALL-4 cells at 100 nM
4863,42 (WB),% JAK3 degradation in MHH-CALL-4 cells at 100 nM
4864,1/32/69/19 (WB),% JAK2 degradation in MHH-CALL-4 cells at 1/10...
4881,0/30/30/53/29 (WB),% EGFR del19 degradation in HCC827 cells at 5/...


In [10]:
df_file = os.path.join(data_dir, 'PROTAC-Pedia.csv')
protac_pedia_df = pd.read_csv(df_file)
print(f'protac_pedia_df len: {len(protac_pedia_df)}')
protac_pedia_df.head()

protac_pedia_df len: 1203


Unnamed: 0,PROTACDB ID,PROTAC SMILES,Active/Inactive,Best PROTAC,Cells,cLogP,Comments,Curator,Dc50,Dmax,...,Proteomics Data Available,Secondary Pubmed,Status,Target,Tested A Non Binding E3 Control,Tested Competition With Ligand,Tested Engagement In Cells,Tested Proteaseome Inhibitor,Time,TPSA
0,1,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,Inactive,No,MOLT-4,10.83732,IC50's are for cell viability assays,Ronen Gabizon,,,...,No,,Reviewed,Q07817,No,No,No,No,48,251.07
1,2,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,Inactive,No,MOLT-4,11.22742,IC50's are for cell viability assays,Ronen Gabizon,,,...,No,,Reviewed,Q07817,No,No,No,No,48,251.07
2,3,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,Inactive,No,MOLT-4,11.61752,IC50's are for cell viability assays,Ronen Gabizon,,,...,No,,Reviewed,Q07817,No,No,No,No,48,251.07
3,4,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,Active,No,MOLT-4,12.00762,IC50's are for cell viability assays,Ronen Gabizon,,,...,No,,Reviewed,Q07817,No,No,No,No,48,251.07
4,5,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,Active,No,MOLT-4,12.39772,IC50's are for cell viability assays,Ronen Gabizon,53 nM,~ 100 %,...,No,,Reviewed,Q07817,No,No,Yes,No,48,251.07


## Get Article Data from PubMed

In [29]:
# Get unique article DOIs from PROTAC-DB for which the Assay (DC50/Dmax) is NaN
unique_dois = protac_df[protac_df['Assay (DC50/Dmax)'].isna() & protac_df['Assay (Percent degradation)'].isna()][['Article DOI']].drop_duplicates()
print(f'unique_dois len: {len(unique_dois)}')
unique_dois.head()

unique_dois len: 233


Unnamed: 0,Article DOI
0,10.1021/acs.jmedchem.8b01413
71,10.1021/acs.jmedchem.9b00455
72,10.1021/acs.jmedchem.8b00909
92,10.1021/acs.jmedchem.8b01572
164,10.1021/acs.jmedchem.6b01816


Convert article DOIs to PubMed IDs:

In [30]:
import requests

def doi_to_pmcid(dois):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
    pmc_ids = []

    for doi in dois:
        # Step 1: Use esearch to find the PMID for the DOI
        search_url = f"{base_url}esearch.fcgi?db=pubmed&term={doi}&retmode=json"
        search_response = requests.get(search_url)
        search_data = search_response.json()
        pmid = search_data['esearchresult']['idlist'][0] if search_data['esearchresult']['idlist'] else None

        if not pmid:
            print(f"No PMID found for DOI {doi}.")
            pmc_ids.append(None)
            continue

        # Step 2: Use elink to find the PMC ID for the PMID
        link_url = f"{base_url}elink.fcgi?dbfrom=pubmed&db=pmc&id={pmid}&retmode=json"
        link_response = requests.get(link_url)
        link_data = link_response.json()
        pmc_id = None
        for linksetdb in link_data["linksets"][0].get("linksetdbs", []):
            if linksetdb["dbto"] == "pmc":
                pmc_id = linksetdb["links"][0]
                break

        if pmc_id:
            print(f"PMC ID for DOI {doi} is PMC{pmc_id}.")
        else:
            print(f"No PMC ID found for DOI {doi}.")
        
        pmc_ids.append(pmc_id if pmc_id else None)

    return pmc_ids

# Example usage
dois = [
    '10.1021/acs.jmedchem.8b01413',
    '10.1021/acs.jmedchem.9b00455',
    '10.1021/acs.jmedchem.8b00909',
    '10.1021/acs.jmedchem.8b01572',
    '10.1021/acs.jmedchem.6b01816',
]

pmc_ids = doi_to_pmcid(dois)

PMC ID for DOI 10.1021/acs.jmedchem.8b01413 is PMC6348446.
PMC ID for DOI 10.1021/acs.jmedchem.9b00455 is PMC10688117.
PMC ID for DOI 10.1021/acs.jmedchem.8b00909 is PMC6545112.
PMC ID for DOI 10.1021/acs.jmedchem.8b01572 is PMC10788944.
PMC ID for DOI 10.1021/acs.jmedchem.6b01816 is PMC5788414.


Fetch article XML data from PubMed:

In [81]:
from Bio import Entrez

def fetch_article(pmc_id):
    Entrez.email = "your.email@example.com"  # Provide your email here
    handle = Entrez.efetch(db="pmc", id=pmc_id, retmode="xml")
    article_xml = handle.read()  # Read raw XML content
    handle.close()
    return article_xml

# Example usage
pmc_id = f"PMC{pmc_ids[0]}"  # Replace with an actual PMC ID
article_xml = fetch_article(pmc_id)
article_xml

b'<?xml version="1.0" ?>\n<!DOCTYPE pmc-articleset PUBLIC "-//NLM//DTD ARTICLE SET 2.0//EN" "https://dtd.nlm.nih.gov/ncbi/pmc/articleset/nlm-articleset-2.0.dtd">\n<pmc-articleset><article xmlns:mml="http://www.w3.org/1998/Math/MathML" article-type="research-article" xml:lang="EN">\n  <?properties open_access?>\n  <front>\n    <journal-meta>\n      <journal-id journal-id-type="nlm-ta">J Med Chem</journal-id>\n      <journal-id journal-id-type="iso-abbrev">J. Med. Chem</journal-id>\n      <journal-id journal-id-type="publisher-id">jm</journal-id>\n      <journal-id journal-id-type="coden">jmcmar</journal-id>\n      <journal-title-group>\n        <journal-title>Journal of Medicinal Chemistry</journal-title>\n      </journal-title-group>\n      <issn pub-type="ppub">0022-2623</issn>\n      <issn pub-type="epub">1520-4804</issn>\n      <publisher>\n        <publisher-name>American Chemical\nSociety</publisher-name>\n      </publisher>\n    </journal-meta>\n    <article-meta>\n      <article

Parse article XML data into a Python dictionary:

In [127]:
from lxml import etree

def format_references(sup_element):
    """Format reference citations found within <sup> tags."""
    references = sup_element.xpath('.//xref[@ref-type="bibr"]/text()')
    # Join multiple references with commas and enclose in brackets
    return '[' + ', '.join(references) + ']'

def parse_paragraph(paragraph):
    """Parse a paragraph element, combining text nodes and references."""
    text_segments = []
    for node in paragraph.iterchildren():
        if node.tag == 'sup':
            text_segments.append(format_references(node))
        else:
            # Directly append text from other tags if needed
            text_segments.append(node.xpath('string()'))
    # Prepend any leading text in the paragraph before the first child element
    leading_text = paragraph.xpath('text()')[0] if paragraph.xpath('text()') else ''
    full_text = leading_text + ''.join(text_segments)
    return full_text

def parse_section(section):
    """Recursively parse a section to extract its title, text, and any subsections."""
    section_data = {
        'title': "".join(section.xpath('.//title/text()')),
        'text': "".join(section.xpath('.//p//text()')),
        # 'text': ' '.join([parse_paragraph(p) for p in section.xpath('.//p')]),
    }
    
    # Process any subsections recursively
    subsections = section.xpath('.//sec')
    for subsection in subsections:
        subsection_title = "".join(subsection.xpath('./title/text()')).strip()
        section_data[subsection_title] = parse_section(subsection)
    
    return section_data

def parse_article_data(article_xml):
    root = etree.fromstring(article_xml)
    # Use correct XPaths based on the XML namespace if applicable
    title = root.xpath('//article-meta//article-title/text()')
    abstract = " ".join(root.xpath('//abstract//p/text()'))
    keywords = root.xpath('//kwd-group/kwd/text()')
    year = root.xpath('//article-meta//pub-date/year/text()')
    
    # Adjust XPath expressions based on actual XML structure
    # sections = ["".join(sec.xpath('.//text()')) for sec in root.xpath('//body//sec')]
    sections = [parse_section(sec) for sec in root.xpath('//body/sec')]

    tables = [{"caption": "".join(table.xpath('.//caption//title//text()')), "content": etree.tostring(table)} for table in root.xpath('//body//table-wrap')]
    figures = [{"caption": "".join(fig.xpath('.//caption//p//text()')), "content": etree.tostring(fig)} for fig in root.xpath('//body//fig')]
    
    return {
        "title": title,
        "abstract": abstract,
        "keywords": keywords,
        "year": year,
        "sections": sections,
        "tables": tables,
        "figures": figures
    }


# Use the parse_article_data function on the fetched data
article_info = parse_article_data(article_xml)
# for k, v in article_info.items():
#     print(f"{k}: {v}\n")
#     # if k == 'sections':
#     #     print(''.join(v))
#     print('='*100 + '\n')

print(f'number of sections: {len(article_info["sections"])}')
for sec in article_info["sections"]:
    print(f'section title: {sec["title"]}')
    print('-'*80)
    print(f'section text: {sec["text"][:500]}')
    print('-'*80)
    print(f'number of subsections: {len(sec) - 2}')
    for subsec in sec:
        if subsec not in ['title', 'text']:
            print('-'*80)
            print(f'\tsubsection title: {subsec}')
            print(f'\tsubsection text: {sec[subsec]["text"][:500]}')
    print('='*100 + '\n')

number of sections: 4
section title: Introduction
--------------------------------------------------------------------------------
section text: Targeted
protein degradation is an emerging strategy to use small
molecules to knock down a protein by hijacking the ubiquitin–proteasome
system.1,2 PROTACs (proteolysis targeting chimeras)
are bifunctional degrader molecules composed of a ligand for the target
protein and a ligand for E3 ligase recruitment, connected by a linker.3,4 Upon formation of a ternary complex target:degrader:E3,5−7 the protein of interest is ubiquitinated and degraded by the proteasome.
Compared to target blockade, pos
--------------------------------------------------------------------------------
number of subsections: 0

section title: Results and DiscussionFirst Generation of BRD7 and BRD9 DegradersSynthesis
of the BRD7/9 Ligand Synthesis of the
First Generation of DegradersThermodynamic Parameters of Formation
of Binary and Ternary Complexes between VCB, BRD9 Br

In [124]:
from IPython.display import display_html

print(f'Tables: {len(article_info["tables"])}')
for table in article_info['tables']:
    print(table['caption'], '\n')
    print(table['content'])
    print('='*100 + '\n')

print(f'Figures: {len(article_info["figures"])}')
for fig in article_info['figures']:
    print(fig['caption'], '\n')
    print(fig['content'])
    print('='*100 + '\n')

Tables: 4
Thermodynamic Parameters of Formation
of Binary and Ternary Complexes between VCB, BRD9 Bromodomain, and
Compound 5 Measured by Isothermal Titration Calorimetry
(ITC)a 

b'<table-wrap xmlns:mml="http://www.w3.org/1998/Math/MathML" id="tbl1" position="float">\n          <label>Table 1</label>\n          <caption>\n            <title>Thermodynamic Parameters of Formation\nof Binary and Ternary Complexes between VCB, BRD9 Bromodomain, and\nCompound <bold>5</bold> Measured by Isothermal Titration Calorimetry\n(ITC)<xref rid="t1fn1" ref-type="table-fn">a</xref></title>\n          </caption>\n          <table frame="hsides" rules="groups" border="0">\n            <colgroup>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n              <col align="left"/>\n            </colgroup>\n       

## Deprecated

In [61]:
def get_all_text(elem, namespaces):
    """
    Recursively retrieve all text within an element, including nested tags.
    Text within <sup> tags is included in brackets.
    :param elem: The current element to process
    :param namespaces: Dictionary of XML namespaces
    """
    text = elem.text if elem.text is not None else ""
    for child in elem:
        # Extract local tag name ignoring namespace
        tag = child.tag.split('}')[-1]  # Splits on '}' and takes the last part, which is the local tag name
        if tag == 'sup':
            text += "[" + get_all_text(child, namespaces) + "]"  # Process <sup> content with brackets
        else:
            text += get_all_text(child, namespaces)
        if child.tail is not None:
            text += child.tail
    # Replace ".[<some text>]" with " [<some text>]." in one go. Both brachets and dot are preserved.
    text = re.sub(r'\.\[([^\]]+)\]', r' [\1].', text)
    text = re.sub(r',\[([^\]]+)\]', r' [\1],', text)
    return text

def parse_article_xml(article_xml):
    # Define the namespaces used in the XML document
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'jats': 'https://jats.nlm.nih.gov/ns/archiving/1.3/'
    }

    # Find the body element
    # Note: Adjust the XPath expression based on the actual structure and namespaces
    body = article_xml.find('.//jats:body', namespaces)

    if not body:
        print("No body found in the XML.")
        return None

    # Initialize a dictionary to hold section titles and their corresponding text
    sections = {'body': {}, 'back': {}, 'front': {}}
    # Iterate over each section in the XML
    for sec in body.findall('.//jats:sec', namespaces):
        # Extract the section title
        title = sec.find('.//jats:title', namespaces).text if sec.find('.//jats:title', namespaces) is not None else "No Title"
        # Extract and concatenate all paragraph texts within this section
        paragraphs = sec.findall('.//jats:p', namespaces)
        section_text = "\n\n".join([get_all_text(p, namespaces) for p in paragraphs if p.text])
        
        # Add this section's text to the dictionary
        sections['body'][title] = section_text
    
    back = article_xml.find('.//jats:back', namespaces)
    if back:
        for sec in back.findall('.//jats:sec', namespaces):
            title = sec.find('.//jats:title', namespaces).text if sec.find('.//jats:title', namespaces) is not None else "No Title"
            paragraphs = sec.findall('.//jats:p', namespaces)
            section_text = "\n\n".join([get_all_text(p, namespaces) for p in paragraphs if p.text])
            sections['back'][title] = section_text
    return sections


article_text = parse_article_xml(article_xml)
for sec, text in article_text['body'].items():
    print(f"Section: {sec}\n\n{text[:500]}...\n")
    print("="*100 + "\n")

Section: Introduction

Targeted
protein degradation is an emerging strategy to use small
molecules to knock down a protein by hijacking the ubiquitin–proteasome
system [1,2]. PROTACs (proteolysis targeting chimeras)
are bifunctional degrader molecules composed of a ligand for the target
protein and a ligand for E3 ligase recruitment, connected by a linker [3,4]. Upon formation of a ternary complex target:degrader:E3 [5−7], the protein of interest is ubiquitinated and degraded by the proteasome.
Compared to target bloc...


Section: Results and Discussion

We began
our investigation by designing a small set of PROTACs aimed to induce
BRD7/9 degradation by recruiting three different E3 ubiquitin ligases:
VHL, CRBN, and DCAF15 [59,60]. We aimed to leverage available
E3 ligase ligands and to maximize the opportunity for complementary
surfaces between the bromodomain and the ligase within the ternary
complex. As BRD7/9 bromodomain ligands, we selected compounds 1a,b (Figure 1) [56], on the 