mohamedrady committed on
Commit
c411d80
·
verified ·
1 Parent(s): 143fe08

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +29 -17
app.py CHANGED
@@ -1,14 +1,13 @@
1
  import os
2
  import re
3
- import camel_tools
4
  from camel_tools.tokenizers.word import simple_word_tokenize
5
- from camel_tools.ner import NERecognizer
6
  import nltk
7
  import torch
8
  from collections import Counter
9
  from transformers import pipeline, AutoModel, AutoTokenizer
10
  import PyPDF2
11
  import gradio as gr
 
12
 
13
  # تحميل وتفعيل الأدوات المطلوبة
14
  nltk.download('punkt')
@@ -16,29 +15,42 @@ nltk.download('punkt')
16
  # التحقق من توفر GPU واستخدامه
17
  device = 0 if torch.cuda.is_available() else -1
18
 
19
- # تحميل نماذج التحليل اللغوي
20
- analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)
21
 
22
- # تحميل نموذج التعرف على الكيانات في camel_tools
23
- ner = NERecognizer.pretrained()
24
 
25
  # تحميل نماذج BERT، GPT2، ELECTRA، و AraBERT
26
- arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
27
- arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
 
 
 
 
 
 
 
 
 
28
 
29
- arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
30
- arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base")
31
 
32
- arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator")
33
- arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator")
34
 
35
- arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
36
- arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")
37
 
38
- # دالة لتحليل النص باستخدام camel_tools
39
  def camel_ner_analysis(text):
 
 
40
  tokens = simple_word_tokenize(text)
41
- entities = ner.predict(tokens)
 
 
42
  entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
43
  for token, tag in zip(tokens, entities):
44
  if tag in entity_dict:
@@ -122,7 +134,7 @@ def extract_dialogues(text):
122
  # دالة لتحليل النصوص واستخراج المعلومات وحفظ النتائج
123
  def analyze_and_complete(file_paths):
124
  results = []
125
- output_directory = os.getenv("SPACE_DIR", "/Volumes/CLOCKWORK T/clockworkspace/first pro")
126
 
127
  for file_path in file_paths:
128
  if file_path.endswith(".pdf"):
 
1
  import os
2
  import re
 
3
  from camel_tools.tokenizers.word import simple_word_tokenize
 
4
  import nltk
5
  import torch
6
  from collections import Counter
7
  from transformers import pipeline, AutoModel, AutoTokenizer
8
  import PyPDF2
9
  import gradio as gr
10
+ import openai
11
 
12
  # تحميل وتفعيل الأدوات المطلوبة
13
  nltk.download('punkt')
 
15
  # التحقق من توفر GPU واستخدامه
16
  device = 0 if torch.cuda.is_available() else -1
17
 
18
+ # إعداد التوكنات
19
+ openai.api_key = "sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU"
20
 
21
+ # تحميل نماذج التحليل اللغوي
22
+ analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device, use_auth_token=huggingface_token)
23
 
24
  # تحميل نماذج BERT، GPT2، ELECTRA، و AraBERT
25
+ arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic", use_auth_token=huggingface_token)
26
+ arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic", use_auth_token=huggingface_token)
27
+
28
+ arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base", use_auth_token=huggingface_token)
29
+ arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base", use_auth_token=huggingface_token)
30
+
31
+ arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator", use_auth_token=huggingface_token)
32
+ arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator", use_auth_token=huggingface_token)
33
+
34
+ arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02", use_auth_token=huggingface_token)
35
+ arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02", use_auth_token=huggingface_token)
36
 
37
+ aragpt2_mega_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", use_auth_token=huggingface_token)
38
+ aragpt2_mega_model = AutoModel.from_pretrained("aubmindlab/aragpt2-mega", use_auth_token=huggingface_token)
39
 
40
+ xlm_roberta_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large", use_auth_token=huggingface_token)
41
+ xlm_roberta_model = AutoModel.from_pretrained("xlm-roberta-large", use_auth_token=huggingface_token)
42
 
43
+ m2m100_tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M", use_auth_token=huggingface_token)
44
+ m2m100_model = AutoModel.from_pretrained("facebook/m2m100_418M", use_auth_token=huggingface_token)
45
 
46
+ # دالة لتحليل النص باستخدام arabert-ner من transformers
47
  def camel_ner_analysis(text):
48
+ tokenizer = AutoTokenizer.from_pretrained("camel-ai/arabert-ner", use_auth_token=huggingface_token)
49
+ model = AutoModel.from_pretrained("camel-ai/arabert-ner", use_auth_token=huggingface_token)
50
  tokens = simple_word_tokenize(text)
51
+ inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True)
52
+ outputs = model(**inputs)
53
+ entities = outputs.logits.argmax(dim=-1).squeeze().tolist()
54
  entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
55
  for token, tag in zip(tokens, entities):
56
  if tag in entity_dict:
 
134
  # دالة لتحليل النصوص واستخراج المعلومات وحفظ النتائج
135
  def analyze_and_complete(file_paths):
136
  results = []
137
+ output_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out"
138
 
139
  for file_path in file_paths:
140
  if file_path.endswith(".pdf"):