Spaces:
Runtime error
Runtime error
File size: 4,089 Bytes
cc3e8a0 6e658c0 cc3e8a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from newspaper import Article
import re
import markdown
def clean(s):
    """Return *s* with literal tab/newline characters escaped to ``\\t``/``\\n``."""
    return s.replace("\t", "\\t").replace("\n", "\\n")
class DocTree:
    """Nested-list view of a document split by heading depth.

    ``content`` is a nested list-of-lists (as produced by ``split_by_heading``)
    whose leaves are markdown/HTML strings. Index paths into the nesting
    address individual sections.
    """

    def __init__(self, content):
        self.content = content
        self.max_depth = 6  # HTML defines h1..h6

    def get_sections(self, *location_ids):
        """Drill into the nested content, indexing once per *location_ids* entry."""
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Recursively flatten *elems* into a single string.

        Leaf lists (lists of strings) are joined directly; nested lists are
        merged depth-first, with each merged child passed through ``clean``
        before the final join.

        Returns '' for an empty (sub)tree.
        """
        if not elems:
            # Guard: the original indexed elems[0] unconditionally and raised
            # IndexError on an empty section list.
            return ''
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        out = []
        for e in elems:
            out.append(self.merge_sections(e))
        return '\n\n '.join(map(clean, out))

    def get_merged_sections(self, *location_ids):
        """Return a list with each child of the addressed section merged to a string."""
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        """Convert an HTML fragment to markdown (markdownify passthrough)."""
        return md(content)

    def get_sections_by_depth(self, depth):
        """Return all sections *depth* levels below the root, flattened into one list."""
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Returns a list of merged sections at a specific depth"""
        if depth == 0:
            return content
        out = []
        for elem in content:
            out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out
def fix_relative_links(url, article_content):
    """Rewrite root-relative markdown links in *article_content* to absolute URLs.

    For every ``[text](target)`` whose target starts with '/', the target is
    joined against the scheme+authority of *url*. Other targets are left
    untouched. Prints 'not found' (original best-effort behavior) when the
    content contains no markdown links.

    Returns the (possibly modified) article content.
    """
    if 'http' in url:
        # Keep scheme + authority only, e.g. 'https://host' from 'https://host/a/b'.
        base_url = '/'.join(url.split('/')[:3])
    else:
        # BUG FIX: the original assigned url.split('/') here — a *list* —
        # which made urljoin() raise TypeError for scheme-less base URLs.
        base_url = url
    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
    matches = pat.findall(article_content)
    if not matches:
        print('not found')
        return article_content
    for text, target in matches:
        # Do not shadow the `url` parameter (the original reused the name).
        absolute = urljoin(base_url, target) if target.startswith('/') else target
        article_content = article_content.replace(f'[{text}]({target})', f'[{text}]({absolute})')
    return article_content
def extract_article(url):
    """Download and parse *url* with newspaper's ``Article``.

    Performs network I/O. Returns the parsed Article object (title, html,
    etc. populated by ``parse()``). Presumably raises newspaper's
    ArticleException on download failure — TODO confirm against newspaper docs.
    """
    article = Article(url)
    article.download()
    article.parse()
    return article
def select_content(html_code, elem_class, class_name):
    """Extract the first *elem_class* element selected by *class_name* and
    return it converted to markdown.

    *class_name* may be ``'.cls'`` (CSS class), ``'#id'`` (element id), or a
    bare string, which is treated as a class name.
    """
    print(f'Calling select_content with {elem_class}, {class_name}')
    elem_id = None
    if class_name.startswith('.'):
        class_name = class_name[1:]
    elif class_name.startswith('#'):
        elem_id = class_name[1:]
        class_name = None
    # BUG FIX: a bare name (including the default 'article-body' passed by
    # doctree_from_url) used to be discarded entirely, so the first
    # <elem_class> on the page was returned regardless of class. A bare
    # name is now kept and used as the class filter.
    return md(str(BS(html_code, features="lxml").find(elem_class, class_=class_name, id=elem_id)))
def split_by_heading(html_content, _i):
    """Recursively split an HTML string into nested lists by heading level.

    At depth *_i* the content is split on ``<h{_i}`` tags; each fragment is
    then split at the next level. Leaves (depth 7, past ``<h6>``) are the raw
    HTML strings.
    """
    if _i >= 7:
        # Past <h6>: nothing left to split on.
        return html_content
    parts = html_content.split(f'<h{_i}')
    elems = []
    for idx, part in enumerate(parts):
        if not part:
            # Empty fragment, e.g. when the document starts with the tag.
            continue
        if idx > 0:
            # Re-attach the '<h{_i}' delimiter consumed by split().
            # BUG FIX: the original startswith('>') heuristic dropped the
            # tag for headings carrying attributes (e.g. '<h1 id="x">')
            # at the start of the document.
            part = f'<h{_i}{part}'
        elems.append(split_by_heading(part, _i + 1))
    return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
    """Download *url*, extract the selected element, and build a DocTree.

    The extracted HTML is round-tripped through markdown so heading-based
    splitting is easier to control; literal '#' characters are shielded with
    a placeholder during the round-trip.
    """
    article = extract_article(url)
    # Work in markdown to handle splitting better.
    md_text = select_content(article.html, elem_class, class_name)
    md_text = f"# {article.title}\n\n{md_text}"
    # Collapse blank lines and shield '#' so the markdown round-trip is stable.
    md_text = md_text.replace('\n\n', '\n').replace('#', '%%@@%%')
    # Fix relative website links.
    md_text = fix_relative_links(url, md_text)
    # Convert back to HTML, restoring the shielded '#'.
    html_text = markdown.markdown(md_text).replace('%%@@%%', '#')
    # NOTE(review): a document-consistency assertion was previously disabled here.
    return DocTree(split_by_heading(html_text, 1))
def get_selectors_for_class(url, elem_class):
    """Collect CSS selectors ('.class' and '#id') found on *elem_class*
    elements of the article at *url*.

    Returns a set of selector strings. Performs network I/O via
    ``extract_article``.
    """
    soup = BS(extract_article(url).html, features="lxml")
    selectors = set()
    for tag in soup.find_all(elem_class):
        selectors.update(f".{cls}" for cls in (tag.get('class') or []))
        tag_id = tag.get('id')
        if tag_id:
            selectors.add(f"#{tag_id}")
    return selectors
|