mohamedrady committed on
Commit
49fcff6
·
verified ·
1 Parent(s): 3b6e8c8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -25
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import re
3
  from camel_tools.tokenizers.word import simple_word_tokenize
 
4
  import nltk
5
  import torch
6
  from collections import Counter
@@ -8,6 +9,8 @@ from transformers import pipeline, AutoModel, AutoTokenizer
8
  import PyPDF2
9
  import gradio as gr
10
  import openai
 
 
11
 
12
  # تحميل وتفعيل الأدوات المطلوبة
13
  nltk.download('punkt')
@@ -15,42 +18,38 @@ nltk.download('punkt')
15
  # التحقق من توفر GPU واستخدامه
16
  device = 0 if torch.cuda.is_available() else -1
17
 
18
- # إعداد التوكنات
19
- openai.api_key = "sk-proj-62TDbO5KABSdkZaFPPD4T3BlbkFJkhqOYpHhL6OucTzNdWSU"
20
-
21
  # تحميل نماذج التحليل اللغوي
22
- analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device, use_auth_token=huggingface_token)
 
 
 
23
 
24
  # تحميل نماذج BERT، GPT2، ELECTRA، و AraBERT
25
- arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic", use_auth_token=huggingface_token)
26
- arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic", use_auth_token=huggingface_token)
27
 
28
- arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base", use_auth_token=huggingface_token)
29
- arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base", use_auth_token=huggingface_token)
30
 
31
- arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator", use_auth_token=huggingface_token)
32
- arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator", use_auth_token=huggingface_token)
33
 
34
- arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02", use_auth_token=huggingface_token)
35
- arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02", use_auth_token=huggingface_token)
36
 
37
- aragpt2_mega_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-mega", use_auth_token=huggingface_token)
38
- aragpt2_mega_model = AutoModel.from_pretrained("aubmindlab/aragpt2-mega", use_auth_token=huggingface_token)
39
 
40
- xlm_roberta_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large", use_auth_token=huggingface_token)
41
- xlm_roberta_model = AutoModel.from_pretrained("xlm-roberta-large", use_auth_token=huggingface_token)
42
 
43
- m2m100_tokenizer = AutoTokenizer.from_pretrained("facebook/m2m100_418M", use_auth_token=huggingface_token)
44
- m2m100_model = AutoModel.from_pretrained("facebook/m2m100_418M", use_auth_token=huggingface_token)
45
 
46
- # دالة لتحليل النص باستخدام arabert-ner من transformers
47
  def camel_ner_analysis(text):
48
- tokenizer = AutoTokenizer.from_pretrained("camel-ai/arabert-ner", use_auth_token=huggingface_token)
49
- model = AutoModel.from_pretrained("camel-ai/arabert-ner", use_auth_token=huggingface_token)
50
  tokens = simple_word_tokenize(text)
51
- inputs = tokenizer(tokens, return_tensors="pt", is_split_into_words=True)
52
- outputs = model(**inputs)
53
- entities = outputs.logits.argmax(dim=-1).squeeze().tolist()
54
  entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
55
  for token, tag in zip(tokens, entities):
56
  if tag in entity_dict:
@@ -134,7 +133,7 @@ def extract_dialogues(text):
134
  # دالة لتحليل النصوص واستخراج المعلومات وحفظ النتائج
135
  def analyze_and_complete(file_paths):
136
  results = []
137
- output_directory = "/Volumes/CLOCKWORK T/clockworkspace/first pro/out"
138
 
139
  for file_path in file_paths:
140
  if file_path.endswith(".pdf"):
 
1
  import os
2
  import re
3
  from camel_tools.tokenizers.word import simple_word_tokenize
4
+ from camel_tools.ner import NERecognizer
5
  import nltk
6
  import torch
7
  from collections import Counter
 
9
  import PyPDF2
10
  import gradio as gr
11
  import openai
12
+ from haystack.nodes import FARMReader
13
+ from paddlenlp import Taskflow
14
 
15
  # تحميل وتفعيل الأدوات المطلوبة
16
  nltk.download('punkt')
 
18
  # التحقق من توفر GPU واستخدامه
19
  device = 0 if torch.cuda.is_available() else -1
20
 
 
 
 
21
  # تحميل نماذج التحليل اللغوي
22
+ analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=device)
23
+
24
+ # تحميل نموذج التعرف على الكيانات في camel_tools
25
+ ner = NERecognizer.pretrained()
26
 
27
  # تحميل نماذج BERT، GPT2، ELECTRA، و AraBERT
28
+ arabic_bert_tokenizer = AutoTokenizer.from_pretrained("asafaya/bert-base-arabic")
29
+ arabic_bert_model = AutoModel.from_pretrained("asafaya/bert-base-arabic")
30
 
31
+ arabic_gpt2_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/aragpt2-base")
32
+ arabic_gpt2_model = AutoModel.from_pretrained("aubmindlab/aragpt2-base")
33
 
34
+ arabic_electra_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/araelectra-base-discriminator")
35
+ arabic_electra_model = AutoModel.from_pretrained("aubmindlab/araelectra-base-discriminator")
36
 
37
+ arabert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
38
+ arabert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")
39
 
40
+ # Configure the OpenAI API: read the key from the OPENAI_API_KEY environment variable; never commit the secret itself.
41
+ openai.api_key = os.getenv("OPENAI_API_KEY")
42
 
43
+ # إعداد farm-haystack
44
+ reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2")
45
 
46
+ # إعداد paddlenlp
47
+ ner_task = Taskflow("ner")
48
 
49
+ # دالة لتحليل النص باستخدام camel_tools
50
  def camel_ner_analysis(text):
 
 
51
  tokens = simple_word_tokenize(text)
52
+ entities = ner.predict(tokens)
 
 
53
  entity_dict = {"PERSON": [], "LOC": [], "ORG": [], "DATE": []}
54
  for token, tag in zip(tokens, entities):
55
  if tag in entity_dict:
 
133
  # دالة لتحليل النصوص واستخراج المعلومات وحفظ النتائج
134
  def analyze_and_complete(file_paths):
135
  results = []
136
+ output_directory = os.getenv("SPACE_DIR", "/app/output")
137
 
138
  for file_path in file_paths:
139
  if file_path.endswith(".pdf"):