import string

from sumy.parsers import DocumentParser
from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.nlp.stemmers import Stemmer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.utils import get_stop_words
from transformers import Pipeline, BertTokenizer


class Summarizer:
    DEFAULT_LANGUAGE = "english"
    DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH = 10
    TOKENIZER = BertTokenizer.from_pretrained('bert-base-cased')
    STOP_WORDS = list(get_stop_words(language=DEFAULT_LANGUAGE)) + list(string.punctuation)

    def __init__(self, pipeline: Pipeline):
        self.pipeline = pipeline
        stemmer = Stemmer(Summarizer.DEFAULT_LANGUAGE)
        self.lsa_summarizer = LsaSummarizer(stemmer)
        self.lsa_summarizer.stop_words = get_stop_words(language=Summarizer.DEFAULT_LANGUAGE)

    @staticmethod
    def sentence_list(summarized_sentences) -> list:
        # sumy sentences expose their raw text via the internal _text attribute
        return [sentence._text for sentence in summarized_sentences]

    @staticmethod
    def join_sentences(summary_sentences: list) -> str:
        return " ".join([sentence for sentence in summary_sentences])

    @staticmethod
    def split_sentences_by_token_length(summary_sentences: list, max_token_length: int) -> list:
        """Group sentences into chunks whose counted token length stays below max_token_length."""
        accumulated_lists = []
        result_list = []
        cumulative_token_length = 0
        for sentence in summary_sentences:
            token_list = Summarizer.TOKENIZER.tokenize(sentence)
            # Stop words and punctuation are excluded from the token budget
            token_words = [token for token in token_list if token.lower() not in Summarizer.STOP_WORDS]
            token_length = len(token_words)
            if token_length + cumulative_token_length >= max_token_length and result_list:
                # Flush the current chunk before the new sentence would push it over the limit
                accumulated_lists.append(Summarizer.join_sentences(result_list))
                result_list = []
                cumulative_token_length = 0
            result_list.append(sentence)
            cumulative_token_length += token_length
        if result_list:
            accumulated_lists.append(Summarizer.join_sentences(result_list))
        return accumulated_lists

    def __extractive_summary(self, parser: DocumentParser, sentences_count: int) -> list:
        summarized_sentences = self.lsa_summarizer(parser.document, sentences_count)
        summarized_list = Summarizer.sentence_list(summarized_sentences)
        return summarized_list

    def extractive_summary_from_text(self, text: str, sentences_count: int) -> list:
        parser = PlaintextParser.from_string(text, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def extractive_summary_from_url(self, url: str, sentences_count: int) -> list:
        parser = HtmlParser.from_url(url, Tokenizer(Summarizer.DEFAULT_LANGUAGE))
        return self.__extractive_summary(parser, sentences_count)

    def abstractive_summary(self, extract_summary_sentences: list) -> list:
        """
        :param extract_summary_sentences: Extractive summary of sentences after Latent semantic analysis
        :return: List of abstractive summary of sentences after calling distilbart-tos-summarizer-tosdr tokenizer
        """
        wrapped_sentences = Summarizer.split_sentences_by_token_length(extract_summary_sentences,
                                                                       max_token_length=1000)
        # The ml6team/distilbart-tos-summarizer-tosdr tokenizer supports a max of 1024 tokens per input
        abstractive_summary_list = []
        for result in self.pipeline(wrapped_sentences, min_length=5, max_length=512):
            abstractive_summary_list.append(result['summary_text'])
        return abstractive_summary_list
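

# A minimal usage sketch (not part of the original class): it assumes the summarization
# pipeline is built from the ml6team/distilbart-tos-summarizer-tosdr checkpoint referenced
# in the comments above; the URL and sentence count below are placeholders.
if __name__ == "__main__":
    from transformers import pipeline

    summarization_pipeline = pipeline("summarization", model="ml6team/distilbart-tos-summarizer-tosdr")
    summarizer = Summarizer(summarization_pipeline)

    # Extract the most salient sentences with LSA, then compress them abstractively.
    extractive_sentences = summarizer.extractive_summary_from_url(
        "https://example.com/terms-of-service",
        sentences_count=Summarizer.DEFAULT_EXTRACTED_ARTICLE_SENTENCES_LENGTH,
    )
    for summary in summarizer.abstractive_summary(extractive_sentences):
        print(summary)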