import string
from sumy.parsers import DocumentParser
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from transformers import Pipeline, BertTokenizer
class Summarizer:
    DEFAULT_LANGUAGE = "english"
    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10

    # The BERT tokenizer is only used to count tokens when splitting sentences into chunks.
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)

    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)
    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        # Collect the raw text of each sumy Sentence object.
        return [sentence._text for sentence in summarized_sentences]
    @staticmethod
    def join_sentences(summary_sentences: list) -> str:
        return " ".join(summary_sentences)
    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
        """Group sentences into chunks of roughly max_token_length non-stop-word tokens each."""
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            result_list.append(sentence)
            # Count only meaningful tokens: stop words and punctuation are ignored.
            token_list = Summarizer.TOKENIZER.tokenize(sentence)
            token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
            token_length = len(token_words)
            if token_length + cumulative_token_length >= max_token_length:
                # The current chunk is full: emit it and start a new one.
                accumulated_lists.append(Summarizer.join_sentences(result_list))
                result_list = []
                cumulative_token_length = 0
            else:
                cumulative_token_length += token_length
        if result_list:
            # Emit whatever remains after the last full chunk.
            accumulated_lists.append(Summarizer.join_sentences(result_list))
        return accumulated_lists
    def __extractive_summary(self, parser: DocumentParser, sentences_count) -> list:
        # Run LSA-based extractive summarization over the parsed document.
        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
        return Summarizer.sentence_list(summarized_sentences)
    def extractive_summary_from_text(self, text: str, sentences_count: int) -> list:
        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def extractive_summary_from_url(self, url: str, sentences_count: int) -> list:
        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)
    def abstractive_summary(self, extract_summary_sentences: list) -> list:
        """
        :param extract_summary_sentences: Sentences from the extractive (latent semantic analysis) summary.
        :return: List of abstractive summaries produced by the distilbart-tos-summarizer-tosdr pipeline.
        """
        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports at most 1024 tokens per input,
        # so the extracted sentences are first regrouped into chunks of roughly 1000 tokens.
        wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
                                                                       max_token_length=1000)
        abstractive_summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
        return abstractive_summary_list
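
# Usage sketch (an illustration, not part of the original module): wires the class above to a
# transformers summarization pipeline. The checkpoint name is taken from the comment in
# abstractive_summary(); the URL and sentence count below are placeholder values.
if __name__ == "__main__":
    from transformers import pipeline

    summarization_pipeline = pipeline("summarization",
                                      model="ml6team/distilbart-tos-summarizer-tosdr")
    summarizer = Summarizer(summarization_pipeline)

    # First extract the most relevant sentences with LSA, then compress them abstractively.
    extracted = summarizer.extractive_summary_from_url(
        "https://example.com/terms-of-service",
        sentences_count=Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH)
    for summary in summarizer.abstractive_summary(extracted):
        print(summary)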