Spaces:

philipp-zettl
/

qa-generator

Sleeping

App Files Files Community

philipp-zettl committed on Jun 14, 2024

Commit

d7eff13

verified ·

1 Parent(s): 082fc10

Upload 2 files

Browse files

Files changed (2) hide show

optimization.py +66 -0
text.py +130 -0

optimization.py ADDED Viewed

	@@ -0,0 +1,66 @@

+from collections import Counter
+from itertools import chain
+import math
+import torch
+from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def ngrams(sequence, n):
    """Return every contiguous window of ``n`` items from ``sequence``.

    Each window is returned as a tuple, in left-to-right order.  The result
    is an empty list when ``n`` is larger than ``len(sequence)``.
    """
    window_count = len(sequence) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(tuple(sequence[start:start + n]))
    return grams
def count_ngrams(sequence, max_n):
    """Count the n-grams of every order from 1 to ``max_n`` in ``sequence``.

    Returns a single ``Counter`` keyed by the n-gram tuples that ``ngrams``
    produces; all orders share the same counter.
    """
    grams_per_order = (ngrams(sequence, order) for order in range(1, max_n + 1))
    return Counter(chain.from_iterable(grams_per_order))
def self_bleu(outputs):
    """Average BLEU of each output scored against all the other outputs.

    Every element of ``outputs`` is passed in turn to ``sentence_bleu`` as the
    hypothesis, with the remaining elements as its references, using
    ``SmoothingFunction().method1`` for smoothing.  Returns 0 when no score
    could be computed (there were no references for any output).
    """
    smoother = SmoothingFunction().method1
    running_total = 0
    scored = 0
    for position, hypothesis in enumerate(outputs):
        others = outputs[:position] + outputs[position + 1:]
        # An output with no peers has nothing to be compared against.
        if not others:
            continue
        running_total += sentence_bleu(others, hypothesis, smoothing_function=smoother)
        scored += 1
    if scored == 0:
        return 0
    return running_total / scored
def dist_n(outputs, n):
    """Return the share of distinct n-grams among all n-grams of ``outputs``.

    The n-grams of every output are pooled together; the result is
    ``unique / total``, or 0 when the pool is empty.
    """
    pooled = []
    for output in outputs:
        pooled.extend(ngrams(output, n))
    if not pooled:
        return 0
    return len(set(pooled)) / len(pooled)
def perplexity(model, tokenizer, texts, stride=512):
    """Estimate the perplexity of ``model`` on ``texts`` with a sliding window.

    The texts are tokenized as one padded, truncated batch.  The token axis is
    then walked in steps of ``stride``; each window carries up to the model's
    context length of tokens, but only the tokens that were not scored by a
    previous window are used as targets (all others are set to -100).

    Assumes a Hugging Face style causal LM: called as
    ``model(input_ids, labels=..., attention_mask=...)``, it returns an object
    whose ``loss`` is the mean loss over the label positions that are not
    -100, with labels shifted by one position internally.

    Args:
        model: Causal language model exposing ``config`` and ``device``.
        tokenizer: Callable returning an object with ``input_ids`` and,
            optionally, ``attention_mask`` tensors.
        texts: Text(s) handed to the tokenizer unchanged.
        stride: Number of new tokens scored per window (default 512).  It is
            capped at the model's context length.

    Returns:
        The perplexity as a float, or NaN when no token could be scored.

    Raises:
        ValueError: If ``stride`` is not positive.
    """
    if stride <= 0:
        raise ValueError('stride must be a positive integer')

    encodings = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    all_ids = encodings.input_ids
    all_mask = getattr(encodings, 'attention_mask', None)
    seq_len = all_ids.size(1)

    # GPT-2 style configs call the context size ``n_positions``; fall back to
    # ``max_position_embeddings`` for configs that do not define it.
    max_length = getattr(model.config, 'n_positions', None)
    if max_length is None:
        max_length = model.config.max_position_embeddings
    # A stride longer than the context would leave tokens that are never scored.
    stride = min(stride, max_length)

    lls = []
    scored_total = 0
    for i in range(0, seq_len, stride):
        begin_loc = max(i + stride - max_length, 0)
        # Clamp to the real sequence end so the last window is not over-counted.
        end_loc = min(i + stride, seq_len)
        trg_len = end_loc - i
        input_ids = all_ids[:, begin_loc:end_loc].to(model.device)
        target_ids = input_ids.clone()
        # Tokens before the last ``trg_len`` positions are context only.
        target_ids[:, :-trg_len] = -100

        model_kwargs = {}
        if all_mask is not None:
            window_mask = all_mask[:, begin_loc:end_loc].to(model.device)
            # Padding must neither be attended to nor counted as a target.
            target_ids[window_mask == 0] = -100
            model_kwargs['attention_mask'] = window_mask

        # The first label of a window is dropped by the internal label shift,
        # so only positions 1.. contribute to the mean loss.
        scored = int((target_ids[:, 1:] != -100).sum())
        if scored == 0:
            continue
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids, **model_kwargs)
        # Undo the mean so windows are weighted by how many tokens they scored.
        lls.append(outputs.loss * scored)
        scored_total += scored

    if not lls:
        return float('nan')
    ppl = torch.exp(torch.stack(lls).sum() / scored_total)
    return ppl.item()
def js_divergence(p, q):
    """Return the Jensen-Shannon divergence between two weight vectors.

    ``p`` and ``q`` are non-negative weights of equal length; each is
    normalised to sum to 1 before comparison.  The natural logarithm is used,
    so the result lies between 0 and ``log(2)``.

    Raises:
        ZeroDivisionError: If either input sums to zero.
        IndexError: If the inputs differ in length.
    """
    def kl_divergence(dist, reference):
        # Terms with a zero on either side are skipped.
        return sum(
            a * math.log(a / b)
            for a, b in zip(dist, reference)
            if a != 0 and b != 0
        )

    # Materialise first so iterators can be summed and then normalised.
    p = list(p)
    q = list(q)
    if len(p) != len(q):
        raise IndexError('p and q must have the same number of entries')

    # Compute each total once instead of once per element.
    p_total = sum(p)
    q_total = sum(q)
    p_norm = [float(value) / p_total for value in p]
    q_norm = [float(value) / q_total for value in q]
    m = [(a + b) / 2 for a, b in zip(p_norm, q_norm)]
    return (kl_divergence(p_norm, m) + kl_divergence(q_norm, m)) / 2

text.py ADDED Viewed

	@@ -0,0 +1,130 @@

+from markdownify import markdownify as md
+from bs4 import BeautifulSoup as BS
+from IPython.display import display, Markdown
+from urllib.parse import urljoin
+from newspaper import Article
+import re
+import markdown
def clean(s):
    """Replace tab and newline characters with visible escape sequences.

    Each tab becomes the two characters backslash-t and each newline becomes
    backslash-n; everything else is left untouched.
    """
    escapes = {ord("\t"): "\\t", ord("\n"): "\\n"}
    return s.translate(escapes)
class DocTree:
    """Nested-list view of a document.

    ``content`` is a list that may contain further lists; the innermost
    elements are strings.  The methods below index into that structure,
    flatten it, or join it back into text.
    """

    def __init__(self, content):
        self.content = content
        # Not read by this class; presumably the number of heading levels
        # (h1-h6) the tree is built from -- confirm against callers.
        self.max_depth = 6

    def get_sections(self, *location_ids):
        """Follow ``location_ids`` as successive indexes into ``content``.

        With no ids the whole content is returned.
        """
        out = self.content
        for id_ in location_ids:
            out = out[id_]
        return out

    def merge_sections(self, elems):
        """Join a (possibly nested) list of strings into one string.

        Elements are joined with a blank line followed by a space.  Nested
        lists are merged recursively and their merged text is passed through
        ``clean`` before joining.  Only the first element is inspected to
        decide whether the list is nested.

        A plain string is returned unchanged and an empty list yields ``''``.
        """
        # A leaf string must not be joined character by character.
        if isinstance(elems, str):
            return elems
        if not elems:
            return ''
        if not isinstance(elems[0], list):
            return '\n\n '.join(elems)
        return '\n\n '.join(clean(self.merge_sections(e)) for e in elems)

    def get_merged_sections(self, *location_ids):
        """Return the merged text of each child found at ``location_ids``."""
        return [self.merge_sections(s) for s in self.get_sections(*location_ids)]

    def as_markdown(self, content):
        """Convert ``content`` with ``markdownify`` and return the result."""
        return md(content)

    def get_sections_by_depth(self, depth):
        """Return the sections found ``depth`` levels below the root."""
        return self._get_sections_by_depth(self.content, depth)

    @staticmethod
    def _get_sections_by_depth(content, depth):
        """Flatten ``content`` by ``depth`` levels and return the result.

        ``depth == 0`` returns ``content`` itself.  Strings reached before
        ``depth`` is exhausted are kept whole rather than descended into.
        """
        if depth == 0:
            return content
        out = []
        for elem in content:
            if isinstance(elem, list):
                out += DocTree._get_sections_by_depth(elem, depth - 1)
            else:
                # Leaf reached early: keep it intact instead of iterating
                # over its characters.
                out.append(elem)
        return out
def fix_relative_links(url, article_content):
    """Make root-relative Markdown link targets absolute.

    Every ``[label](target)`` in ``article_content`` whose target starts with
    ``/`` has that target resolved against ``url``; all other links and the
    surrounding text are returned unchanged.  When ``url`` has no scheme the
    resolved target is still root-relative, so such links are left as they
    are instead of raising.
    """
    link_pattern = re.compile(r'\[(.*?)\]\((.*?)\)')

    def absolutize(match):
        label, target = match.group(1), match.group(2)
        if not target.startswith('/'):
            return match.group(0)
        # urljoin keeps only scheme and host of ``url`` for a target that
        # starts with '/', so the page's own path never leaks into the link.
        return f'[{label}]({urljoin(url, target)})'

    return link_pattern.sub(absolutize, article_content)
def extract_article(url):
    """Download and parse the page at ``url`` with newspaper's ``Article``.

    Returns the ``Article`` object after ``download()`` and ``parse()`` have
    been called on it.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched
def select_content(html_code, elem_class, class_name):
    """Return the first matching element of ``html_code`` as Markdown.

    Args:
        html_code: HTML source to search.
        elem_class: Tag name to look for (for example ``'div'``).
        class_name: Selector for the element: ``'.name'`` matches a CSS
            class, ``'#name'`` matches an id.  Any other value applies no
            attribute filter, so the first ``elem_class`` tag is used.

    Returns:
        The element converted with ``markdownify``, or ``''`` when nothing
        matches.
    """
    print(f'Calling select_content with {elem_class}, {class_name}')
    # Only pass the filter that was actually requested: handing ``None`` to
    # ``find`` is not the same as omitting it, since Beautiful Soup can treat
    # a ``None`` filter as "this attribute must be absent".
    filters = {}
    if class_name.startswith('.'):
        filters['class_'] = class_name[1:]
    elif class_name.startswith('#'):
        filters['id'] = class_name[1:]
    # NOTE(review): a bare name such as 'article-body' is ignored here, as in
    # the original else-branch -- confirm it should not be treated as a class.

    found = BS(html_code, features="lxml").find(elem_class, **filters)
    if found is None:
        # Avoid rendering the literal text 'None' as the article body.
        return ''
    return md(str(found))
def split_by_heading(html_content, _i):
    """Recursively split HTML into nested lists, one level per heading tag.

    The text is cut at every ``<h{_i}`` opening, each piece is split again at
    the next heading level, and so on up to ``h6``.  The result for levels
    1-6 is a list; from level 7 on the text itself is returned, so the leaves
    of the nesting are strings.  Empty pieces are dropped.

    Args:
        html_content: HTML text to split.
        _i: Heading level to split at (1 for ``<h1``).
    """
    if _i >= 7:
        return html_content
    marker = f'<h{_i}'
    elems = []
    for position, piece in enumerate(html_content.split(marker)):
        if not piece:
            continue
        # Everything after the first split point had its heading opener
        # removed by ``split``; put it back.  Deciding by position (not by a
        # leading '>') keeps headings with attributes intact and leaves a
        # preamble that merely starts with '>' alone.
        if position > 0:
            piece = marker + piece
        elems.append(split_by_heading(piece, _i + 1))
    return elems
def doctree_from_url(url, elem_class='div', class_name='article-body'):
    """Fetch ``url`` and build a ``DocTree`` from the selected article element.

    Steps: download the article, convert the selected element to Markdown,
    prepend the article title as a heading line, fix root-relative links,
    render the Markdown back to HTML and split that HTML by heading level.

    Args:
        url: Address of the article.
        elem_class: Tag name handed to ``select_content``.
        class_name: Selector handed to ``select_content``.
    """
    page = extract_article(url)

    # Work on Markdown first; the original author found it easier to split.
    body_markdown = select_content(page.html, elem_class, class_name)
    titled = f"# {page.title}\n\n" + body_markdown

    # Blank lines are collapsed and every '#' is swapped for a placeholder
    # that is only restored after rendering, so the Markdown renderer never
    # sees a '#' character.  NOTE(review): the reason is not visible in this
    # file -- confirm before changing.
    masked = titled.replace('\n\n', '\n').replace('#', '%%@@%%')

    relinked = fix_relative_links(url, masked)

    rendered = markdown.markdown(relinked)
    html_content = rendered.replace('%%@@%%', '#')

    return DocTree(split_by_heading(html_content, 1))
def get_selectors_for_class(url, elem_class):
    """Collect the class and id selectors used by ``elem_class`` tags at ``url``.

    The page is downloaded and every ``elem_class`` tag is inspected.  Each
    CSS class contributes ``'.<class>'`` and each id contributes ``'#<id>'``.

    Returns:
        A set containing all collected selectors.
    """
    article = extract_article(url)
    soup = BS(article.html, features="lxml")
    classes = set()
    ids = set()
    for elem in soup.find_all(elem_class):
        classes.update(f".{c}" for c in elem.get('class') or ())
        elem_id = elem.get('id')
        if elem_id:
            # ``id`` is normally a single string; iterating it directly would
            # yield one selector per character.
            id_values = [elem_id] if isinstance(elem_id, str) else elem_id
            ids.update(f"#{value}" for value in id_values)
    return ids | classes