Spaces:
Runtime error
Runtime error
File size: 4,089 Bytes
cc3e8a0 6e658c0 cc3e8a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from markdownify import markdownify as md
from bs4 import BeautifulSoup as BS
from urllib.parse import urljoin
from newspaper import Article
import re
import markdown
def clean(s):
    """Return *s* with literal tab/newline characters escaped to ``\\t``/``\\n``."""
    return s.replace("\t", "\\t").replace("\n", "\\n")
class DocTree:
    """Nested-list view of a document split by heading depth.

    ``content`` is a nested list-of-lists (as produced by ``split_by_heading``)
    whose leaves are markdown/HTML strings. Index paths into the nesting
    address individual sections.
    """

    def __init__(self, content):
        self.content = content
        self.max_depth = 6  # HTML defines h1..h6

    def get_sections(self, *location_ids):
        """Drill into the nested content, indexing once per *location_ids* entry."""
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Recursively flatten *elems* into a single string.

        Leaf lists (lists of strings) are joined directly; nested lists are
        merged depth-first, with each merged child passed through ``clean``
        before the final join.

        Returns '' for an empty (sub)tree.
        """
        if not elems:
            # Guard: the original indexed elems[0] unconditionally and raised
            # IndexError on an empty section list.
            return ''
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        out = []
        for e in elems:
            out.append(self.merge_sections(e))
        return '\n\n '.join(map(clean, out))

    def get_merged_sections(self, *location_ids):
        """Return a list with each child of the addressed section merged to a string."""
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        """Convert an HTML fragment to markdown (markdownify passthrough)."""
        return md(content)

    def get_sections_by_depth(self, depth):
        """Return all sections *depth* levels below the root, flattened into one list."""
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Returns a list of merged sections at a specific depth"""
        if depth == 0:
            return content
        out = []
        for elem in content:
            out += DocTree._get_sections_by_depth(elem, depth - 1)
        return out
def fix_relative_links(url, article_content):
    """Rewrite root-relative markdown links in *article_content* to absolute URLs.

    For every ``[text](target)`` whose target starts with '/', the target is
    joined against the scheme+authority of *url*. Other targets are left
    untouched. Prints 'not found' (original best-effort behavior) when the
    content contains no markdown links.

    Returns the (possibly modified) article content.
    """
    if 'http' in url:
        # Keep scheme + authority only, e.g. 'https://host' from 'https://host/a/b'.
        base_url = '/'.join(url.split('/')[:3])
    else:
        # BUG FIX: the original assigned url.split('/') here — a *list* —
        # which made urljoin() raise TypeError for scheme-less base URLs.
        base_url = url
    pat = re.compile(r'\[(.*?)\]\((.*?)\)', flags=re.IGNORECASE)
    matches = pat.findall(article_content)
    if not matches:
        print('not found')
        return article_content
    for text, target in matches:
        # Do not shadow the `url` parameter (the original reused the name).
        absolute = urljoin(base_url, target) if target.startswith('/') else target
        article_content = article_content.replace(f'[{text}]({target})', f'[{text}]({absolute})')
    return article_content
def extract_article(url):
    """Download and parse *url* with newspaper's ``Article``.

    Performs network I/O. Returns the parsed Article object (title, html,
    etc. populated by ``parse()``). Presumably raises newspaper's
    ArticleException on download failure — TODO confirm against newspaper docs.
    """
    article = Article(url)
    article.download()
    article.parse()
    return article
def select_content(html_code, elem_class, class_name):
    """Extract the first *elem_class* element selected by *class_name* and
    return it converted to markdown.

    *class_name* may be ``'.cls'`` (CSS class), ``'#id'`` (element id), or a
    bare string, which is treated as a class name.
    """
    print(f'Calling select_content with {elem_class}, {class_name}')
    elem_id = None
    if class_name.startswith('.'):
        class_name = class_name[1:]
    elif class_name.startswith('#'):
        elem_id = class_name[1:]
        class_name = None
    # BUG FIX: a bare name (including the default 'article-body' passed by
    # doctree_from_url) used to be discarded entirely, so the first
    # <elem_class> on the page was returned regardless of class. A bare
    # name is now kept and used as the class filter.
    return md(str(BS(html_code, features="lxml").find(elem_class, class_=class_name, id=elem_id)))
def split_by_heading(html_content, _i):
    """Recursively split an HTML string into nested lists by heading level.

    At depth *_i* the content is split on ``<h{_i}`` tags; each fragment is
    then split at the next level. Leaves (depth 7, past ``<h6>``) are the raw
    HTML strings.
    """
    if _i >= 7:
        # Past <h6>: nothing left to split on.
        return html_content
    parts = html_content.split(f'<h{_i}')
    elems = []
    for idx, part in enumerate(parts):
        if not part:
            # Empty fragment, e.g. when the document starts with the tag.
            continue
        if idx > 0:
            # Re-attach the '<h{_i}' delimiter consumed by split().
            # BUG FIX: the original startswith('>') heuristic dropped the
            # tag for headings carrying attributes (e.g. '<h1 id="x">')
            # at the start of the document.
            part = f'<h{_i}{part}'
        elems.append(split_by_heading(part, _i + 1))
    return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
    """Download *url*, extract the selected element, and build a DocTree.

    The extracted HTML is round-tripped through markdown so heading-based
    splitting is easier to control; literal '#' characters are shielded with
    a placeholder during the round-trip.
    """
    article = extract_article(url)
    # Work in markdown to handle splitting better.
    md_text = select_content(article.html, elem_class, class_name)
    md_text = f"# {article.title}\n\n{md_text}"
    # Collapse blank lines and shield '#' so the markdown round-trip is stable.
    md_text = md_text.replace('\n\n', '\n').replace('#', '%%@@%%')
    # Fix relative website links.
    md_text = fix_relative_links(url, md_text)
    # Convert back to HTML, restoring the shielded '#'.
    html_text = markdown.markdown(md_text).replace('%%@@%%', '#')
    # NOTE(review): a document-consistency assertion was previously disabled here.
    return DocTree(split_by_heading(html_text, 1))
def get_selectors_for_class(url, elem_class):
    """Collect CSS selectors ('.class' and '#id') found on *elem_class*
    elements of the article at *url*.

    Returns a set of selector strings. Performs network I/O via
    ``extract_article``.
    """
    soup = BS(extract_article(url).html, features="lxml")
    selectors = set()
    for tag in soup.find_all(elem_class):
        selectors.update(f".{cls}" for cls in (tag.get('class') or []))
        tag_id = tag.get('id')
        if tag_id:
            selectors.add(f"#{tag_id}")
    return selectors
|