|
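"""Arabic text summarization backends for the Streamlit demo.

get_results() dispatches between several Hugging Face seq2seq checkpoints
and a frequency-based NLTK extractive summarizer.
"""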
import heapq
import logging
import os
from functools import lru_cache
from string import punctuation

import nltk
from codetiming import Timer
from nltk.corpus import stopwords
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Tokenizer data and stop-word lists for the NLTK extractive summarizer.
nltk.download('punkt')
nltk.download('stopwords')

# Treat the newline as punctuation when filtering tokens.
punctuation = punctuation + '\n'

logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The transformer models are loaded lazily inside get_results() the first
# time each one is requested; lru_cache then reuses summaries for repeated
# (text, model, parameter) combinations.
logger.info("Models will be loaded on first use.")
|
@lru_cache(maxsize=200)
def get_results(text, model_selected, num_beams, length_penalty, number_of_sentence):
    logger.info("\n=================================================================")
    logger.info(f"Text: {text}")
    logger.info(f"model_selected: {model_selected}")
    logger.info(f"num_beams: {num_beams}")
    logger.info(f"length_penalty: {length_penalty}")
    reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
    reader_time.start()

    # GPT-2-style checkpoints handle shorter inputs than the seq2seq models.
    if model_selected == 'GPT-2':
        number_of_tokens_limit = 80
    else:
        number_of_tokens_limit = 150
    logger.info(f"input length: {len(text.split())}")
|
    # All transformer back-ends share the same generation settings; they
    # differ only in the Hugging Face checkpoint that gets loaded.
    checkpoints = {
        'arabartsummarization': "abdalrahmanshahrour/arabartsummarization",
        'AraBART': "abdalrahmanshahrour/AraBART-summ",
        'auto-arabic-summarization': "abdalrahmanshahrour/auto-arabic-summarization",
        'BERT2BERT': "malmarjeh/bert2bert",
        'xlmroberta2xlmroberta': "ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar",
    }

    if model_selected in checkpoints:
        model_name = checkpoints[model_selected]
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
        # Cap the input at the model's word budget before generation.
        text = ' '.join(text.split()[:number_of_tokens_limit])
        result = summarizer(text,
                            pad_token_id=tokenizer.eos_token_id,
                            num_beams=num_beams,
                            repetition_penalty=3.0,
                            max_length=200,
                            length_penalty=length_penalty,
                            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info(model_selected)
|
    elif model_selected == "nltk_summarizer":
        # Frequency-based extractive summarization: rank sentences by the
        # normalized frequencies of their non-stop-word tokens.
        stop_words = set(stopwords.words("arabic") + stopwords.words("english"))
        word_frequencies = {}
        for word in nltk.word_tokenize(text):
            if word not in stop_words and word not in punctuation:
                word_frequencies[word] = word_frequencies.get(word, 0) + 1

        # Normalize counts by the most frequent word (default guards empty input).
        maximum_frequency = max(word_frequencies.values(), default=3)
        for word in word_frequencies:
            word_frequencies[word] = word_frequencies[word] / maximum_frequency

        # Score each sentence shorter than 30 words by summing the
        # frequencies of the scored words it contains.
        sentence_scores = {}
        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies and len(sent.split(' ')) < 30:
                    sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]

        # Keep the requested number of top-scoring sentences.
        summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)
        result = ' '.join(summary_sentences)
|
    else:
        result = "الرجاء اختيار نموذج"  # "Please select a model."

    reader_time.stop()
    logger.info(f"Time spent summarizing: {reader_time.last}")

    return result
|
|
if __name__ == "__main__":
    # Make the module's logger output visible when run directly.
    logging.basicConfig(level=logging.INFO)
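    # Hypothetical smoke test, not part of the original app: the sample text
    # and parameter values below are illustrative assumptions only, and the
    # first run downloads the AraBART checkpoint from the Hugging Face Hub.
    sample = "تشهد العاصمة اليوم انطلاق فعاليات المؤتمر الدولي للذكاء الاصطناعي بمشاركة باحثين من عدة دول عربية وأجنبية."
    print(get_results(sample, 'AraBART', num_beams=3, length_penalty=1.0, number_of_sentence=1))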