abdalrahmanshahrour committed
Commit 5d711dd · 1 Parent(s): 62f0e4d

Upload 3 files

Files changed (3)
  1. app.py +55 -0
  2. requirements.txt +70 -0
  3. summarize.py +99 -0
app.py ADDED
@@ -0,0 +1,55 @@
+ import arabic_reshaper
+ import streamlit as st
+ from bidi.algorithm import get_display
+
+ from summarize import get_results
+
+ st.set_page_config(
+     page_title="Arabic Summarization",
+     page_icon="🤖",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     menu_items={
+         'Get Help': 'https://www.extremelycoolapp.com/help',
+         'Report a bug': "https://www.extremelycoolapp.com/bug",
+         'About': "# Arabic Text Summarization, abdalrahman shahrour",
+     },
+ )
+
+ # Reshape and reorder Arabic text so it renders right-to-left correctly.
+ rtl = lambda w: get_display(arabic_reshaper.reshape(w))
+
+ st.header('Arabic Text Summarization')
+
+ # Sidebar controls: model choice and generation parameters.
+ model = st.sidebar.selectbox(
+     'Select one',
+     ['arabartsummarization', 'AraBART', 'auto-arabic-summarization',
+      'BERT2BERT', 'xlmroberta2xlmroberta', 'nltk_summarizer'],
+     help="Model",
+ )
+ st.sidebar.write("\n")
+ num_beams = st.sidebar.slider(
+     "Number of beams", min_value=1, max_value=10, value=3, step=1
+ )
+ st.sidebar.write("\n")
+ length_penalty = st.sidebar.slider(
+     "Length penalty", min_value=0.1, max_value=3.0, value=1.0, step=0.1,
+ )
+ number_of_sentence = st.sidebar.slider(
+     "Number of sentences", min_value=1, max_value=10, value=3, step=1
+ )
+
+ doc = st.text_area(
+     "Enter the text to be summarized",
+     height=550,
+     value=" شهدت مدينة طرابلس، مساء أمس الأربعاء، احتجاجات شعبية وأعمال شغب لليوم الثالث على التوالي، وذلك بسبب تردي الوضع المعيشي والاقتصادي. واندلعت مواجهات عنيفة وعمليات كر وفر ما بين الجيش اللبناني والمحتجين استمرت لساعات، إثر محاولة فتح الطرقات المقطوعة، ما أدى إلى إصابة العشرات من الطرفين.",
+ )
+
+ summarize_button = st.button(label="🧞‍♂️ summarize 🧞‍♂️")
+
+ if summarize_button:
+     with st.spinner("جاري التلخيص ..."):  # "Summarizing..."
+         result = get_results(doc, model, num_beams, length_penalty, number_of_sentence)
+     if result:
+         st.write(result)
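To try the app locally, the standard Streamlit workflow should work, assuming the pinned dependencies from requirements.txt below:

    pip install -r requirements.txt
    streamlit run app.py

Each abstractive model is downloaded from the Hugging Face Hub on its first use, so the first summarization per model takes noticeably longer than later ones.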
requirements.txt ADDED
@@ -0,0 +1,70 @@
+ altair==4.2.0
+ arabert==1.0.1
+ arabic-reshaper==2.1.3
+ attrs==22.2.0
+ blinker==1.5
+ cachetools==5.2.0
+ certifi==2022.12.7
+ charset-normalizer==2.1.1
+ click==8.1.3
+ codetiming==1.3.0
+ commonmark==0.9.1
+ decorator==5.1.1
+ emoji==1.4.2
+ entrypoints==0.4
+ farasapy==0.0.14
+ filelock==3.8.2
+ future==0.18.2
+ gitdb==4.0.10
+ GitPython==3.1.29
+ huggingface-hub==0.11.1
+ idna==3.4
+ importlib-metadata==5.2.0
+ Jinja2==3.1.2
+ joblib==1.2.0
+ jsonschema==4.17.3
+ MarkupSafe==2.1.1
+ nltk==3.8
+ numpy==1.24.0
+ nvidia-cublas-cu11==11.10.3.66
+ nvidia-cuda-nvrtc-cu11==11.7.99
+ nvidia-cuda-runtime-cu11==11.7.99
+ nvidia-cudnn-cu11==8.5.0.96
+ packaging==22.0
+ pandas==1.5.2
+ Pillow==9.3.0
+ preprocess==2.0.0
+ protobuf==3.20.2
+ PyArabic==0.6.15
+ pyarrow==10.0.1
+ pydeck==0.8.0
+ Pygments==2.13.0
+ Pympler==1.0.1
+ pyrsistent==0.19.2
+ python-bidi==0.4.2
+ python-dateutil==2.8.2
+ pytz==2022.7
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ regex==2022.10.31
+ requests==2.28.1
+ rich==12.6.0
+ semver==2.13.0
+ sentencepiece==0.1.97
+ six==1.16.0
+ smmap==5.0.0
+ streamlit==1.16.0
+ tokenizers==0.13.2
+ toml==0.10.2
+ toolz==0.12.0
+ torch==1.13.1
+ tornado==6.2
+ tqdm==4.64.1
+ transformers==4.25.1
+ typing_extensions==4.4.0
+ tzdata==2022.7
+ tzlocal==4.2
+ urllib3==1.26.13
+ validators==0.20.0
+ watchdog==2.2.0
+ zipp==3.11.0
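All versions are fully pinned, pip-freeze style. The nvidia-* CUDA 11 wheels come in as Linux dependencies of torch==1.13.1; nothing in the code moves tensors to a GPU, so the app should run on CPU as well.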
summarize.py ADDED
@@ -0,0 +1,99 @@
+ import heapq
+ import logging
+ import os
+ from functools import lru_cache
+ from string import punctuation
+
+ import nltk
+ from codetiming import Timer
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
+
+ punctuation = punctuation + '\n'
+ logger = logging.getLogger(__name__)
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ logger.info("Loading resources...")
+ reader_time = Timer("loading", text="Time: {:.2f}", logger=logging.info)
+ reader_time.start()
+
+ # Fetch the NLTK data used by the extractive summarizer.
+ nltk.download("punkt", quiet=True)
+ nltk.download("stopwords", quiet=True)
+
+ reader_time.stop()
+
+ logger.info("Finished loading resources.")
+ logger.info(f"Time spent loading: {reader_time.last}")
+
+ # The five abstractive options share one seq2seq generation path and
+ # differ only in which Hugging Face checkpoint is loaded.
+ TRANSFORMER_MODELS = {
+     'arabartsummarization': "abdalrahmanshahrour/arabartsummarization",
+     'AraBART': "abdalrahmanshahrour/AraBART-summ",
+     'auto-arabic-summarization': "abdalrahmanshahrour/auto-arabic-summarization",
+     'BERT2BERT': "malmarjeh/bert2bert",
+     'xlmroberta2xlmroberta': "ahmeddbahaa/xlmroberta2xlmroberta-finetune-summarization-ar",
+ }
+
+
+ @lru_cache(maxsize=200)
+ def get_results(text, model_selected, num_beams, length_penalty, number_of_sentence):
+     logger.info("\n=================================================================")
+     logger.info(f"Text: {text}")
+     logger.info(f"model_selected: {model_selected}")
+     logger.info(f"length_penalty: {length_penalty}")
+     logger.info(f"input length: {len(text.split())}")
+     reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
+     reader_time.start()
+
+     if model_selected in TRANSFORMER_MODELS:
+         # Abstractive summarization with a pretrained seq2seq checkpoint.
+         model_name = TRANSFORMER_MODELS[model_selected]
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+         summarizer = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
+         result = summarizer(text,
+                             pad_token_id=tokenizer.eos_token_id,
+                             num_beams=num_beams,
+                             repetition_penalty=3.0,
+                             max_length=200,
+                             length_penalty=length_penalty,
+                             no_repeat_ngram_size=3)[0]['generated_text']
+         logger.info(model_selected)
+
+     elif model_selected == "nltk_summarizer":
+         # Extractive summarization: weight words by normalized frequency and
+         # keep the `number_of_sentence` highest-scoring sentences.
+         stop_words = set(nltk.corpus.stopwords.words("arabic") + nltk.corpus.stopwords.words("english"))
+         word_frequencies = {}
+         for word in nltk.word_tokenize(text):
+             if word not in stop_words and word not in punctuation:
+                 word_frequencies[word] = word_frequencies.get(word, 0) + 1
+
+         maximum_frequency = max(word_frequencies.values(), default=3)
+         for word in word_frequencies:
+             word_frequencies[word] = word_frequencies[word] / maximum_frequency
+
+         # Score sentences shorter than 30 words by the sum of their word weights.
+         sentence_scores = {}
+         for sent in nltk.sent_tokenize(text):
+             if len(sent.split(' ')) < 30:
+                 for word in nltk.word_tokenize(sent.lower()):
+                     if word in word_frequencies:
+                         sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[word]
+
+         summary_sentences = heapq.nlargest(number_of_sentence, sentence_scores, key=sentence_scores.get)
+         result = ' '.join(summary_sentences)
+
+     else:
+         result = "الرجاء اختيار نموذج"  # "Please choose a model."
+
+     reader_time.stop()
+     logger.info(f"Time spent summarizing: {reader_time.last}")
+
+     return result
+
+
+ if __name__ == "__main__":
+     # Smoke test for the dependency-light extractive path.
+     print(get_results("هذا نص تجريبي. وهذه جملة ثانية.", "nltk_summarizer", 3, 1.0, 1))
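A note on the design: the `@lru_cache` on `get_results` only memoizes exact repeats of (text, model, parameters), so any new input reloads the selected checkpoint from disk inside the function. A minimal sketch of caching the loaded pipeline per model instead (the `load_summarizer` helper is hypothetical, not part of this commit):

    from functools import lru_cache
    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

    @lru_cache(maxsize=None)
    def load_summarizer(model_name: str):
        # Hypothetical helper: load each checkpoint once per process
        # and reuse it across requests.
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

get_results would then call load_summarizer(model_name) instead of rebuilding the pipeline on every cache miss; newer Streamlit releases offer st.cache_resource for the same purpose across app reruns.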