ghizlaneimane committed on
Commit
61db26f
·
verified ·
1 Parent(s): 9c9ef6b

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +181 -0
main.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import io
import os
import tempfile
from functools import lru_cache
from typing import Optional

import docx
import fitz  # PyMuPDF
import pandas as pd
import pptx
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from langdetect import detect
from PIL import Image
from transformers import pipeline
15
+
16
# ASGI application object that the route decorators in this module attach to.
app = FastAPI()

# CORS is configured fully open: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is
# discouraged by the FastAPI CORS documentation -- confirm this is intended.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
25
+
26
# Language codes accepted both as detected source language and as requested
# target language.
SUPPORTED_LANGUAGES = [
    "fr",
    "en",
    "de",
    "es",
    "it",
    "zh",
    "ar",
]
28
+
29
# Direct translation checkpoints, keyed by "<source>-<target>" language pair.
# Each value is the Hugging Face model id for that pair.
# NOTE(review): the original comment says these all exist on Hugging Face;
# that has not been verified here for every pair -- confirm before relying on
# the rarer ones.
translation_models = {
    "fr-en": "Helsinki-NLP/opus-mt-fr-en",
    "en-fr": "Helsinki-NLP/opus-mt-en-fr",
    "fr-de": "Helsinki-NLP/opus-mt-fr-de",
    "de-fr": "Helsinki-NLP/opus-mt-de-fr",
    "fr-es": "Helsinki-NLP/opus-mt-fr-es",
    "es-fr": "Helsinki-NLP/opus-mt-es-fr",
    "en-zh": "Helsinki-NLP/opus-mt-en-zh",
    "zh-en": "Helsinki-NLP/opus-mt-zh-en",
    "en-it": "Helsinki-NLP/opus-mt-en-it",
    "it-en": "Helsinki-NLP/opus-mt-it-en",
    "en-ar": "Helsinki-NLP/opus-mt-en-ar",
    "ar-en": "Helsinki-NLP/opus-mt-ar-en",
    "en-es": "Helsinki-NLP/opus-mt-en-es",
    "en-de": "Helsinki-NLP/opus-mt-en-de",
    "es-ar": "Helsinki-NLP/opus-mt-es-ar",
    "es-en": "Helsinki-NLP/opus-mt-es-en",
    "es-de": "Helsinki-NLP/opus-mt-es-de",
    "es-it": "Helsinki-NLP/opus-mt-es-it",
    "es-zh": "Helsinki-NLP/opus-mt-es-zh",
    "ar-fr": "Helsinki-NLP/opus-mt-ar-fr",
    "ar-de": "Helsinki-NLP/opus-mt-ar-de",
    "ar-es": "Helsinki-NLP/opus-mt-ar-es",
    "ar-it": "Helsinki-NLP/opus-mt-ar-it",
    "ar-zh": "Helsinki-NLP/opus-mt-ar-zh",
    "de-en": "Helsinki-NLP/opus-mt-de-en",
    # NOTE(review): a German-to-German pair looks unintended -- confirm
    # whether this entry should exist at all.
    "de-de": "Helsinki-NLP/opus-mt-de-de",
    "de-es": "Helsinki-NLP/opus-mt-de-es",
    "de-it": "Helsinki-NLP/opus-mt-de-it",
    "de-zh": "Helsinki-NLP/opus-mt-de-zh",
    "de-ar": "Helsinki-NLP/opus-mt-de-ar",
    "it-fr": "Helsinki-NLP/opus-mt-it-fr",
    "it-de": "Helsinki-NLP/opus-mt-it-de",
    "it-es": "Helsinki-NLP/opus-mt-it-es",
    "it-zh": "Helsinki-NLP/opus-mt-it-zh",
    "it-ar": "Helsinki-NLP/opus-mt-it-ar",
    "zh-fr": "Helsinki-NLP/opus-mt-zh-fr",
    # Previously pointed at the zh-en checkpoint, which made Chinese-to-German
    # requests come back in English.
    "zh-de": "Helsinki-NLP/opus-mt-zh-de",
    "zh-it": "Helsinki-NLP/opus-mt-zh-it",
    "zh-es": "Helsinki-NLP/opus-mt-zh-es",
    "zh-ar": "Helsinki-NLP/opus-mt-zh-ar",
}
74
+
75
def extract_text_from_pdf(file_path):
    """Return the plain text of every page of the PDF at *file_path*.

    Each page's text is followed by a newline; pages are emitted in document
    order. The document is closed before returning.
    """
    with fitz.open(file_path) as pdf:
        return "".join(page.get_text("text") + "\n" for page in pdf)
81
+
82
def extract_text_from_docx(file_path):
    """Return the paragraph texts of the Word document at *file_path*.

    Paragraphs are joined with newlines.
    NOTE(review): only ``document.paragraphs`` is read; presumably text held
    in tables or headers is not included -- confirm against python-docx.
    """
    document = docx.Document(file_path)
    paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
    return "\n".join(paragraph_texts)
85
+
86
def extract_text_from_pptx(file_path):
    """Return the text of every text-bearing shape in the deck at *file_path*.

    Slides and shapes are visited in order; shapes that expose no ``text``
    attribute are skipped. The collected texts are joined with newlines.
    """
    deck = pptx.Presentation(file_path)
    return "\n".join(
        shape.text
        for slide in deck.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )
94
+
95
def extract_text_from_excel(file_path):
    """Return the workbook at *file_path* rendered as a plain-text table.

    The sheet pandas reads by default is loaded and formatted with
    ``DataFrame.to_string`` without the index column.

    The reader engine is intentionally not forced: this function is also
    used for legacy ``.xls`` uploads, which openpyxl cannot open. Leaving
    ``engine`` unset lets pandas pick the engine that matches the file
    (openpyxl for ``.xlsx``; ``.xls`` needs the optional ``xlrd`` package).
    """
    df = pd.read_excel(file_path)
    return df.to_string(index=False)
98
+
99
def chunk_text(text, max_length=512):
    """Split *text* into whitespace-joined chunks of at most *max_length* characters.

    Words are packed greedily and re-joined with single spaces. A single
    word longer than *max_length* (for example text in a language written
    without spaces) is cut into *max_length*-sized pieces so that no chunk
    ever exceeds the limit.

    Args:
        text: The text to split; any run of whitespace separates words.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of non-empty chunks; an empty list for blank input.

    Raises:
        ValueError: If *max_length* is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    chunks = []
    current_words = []
    current_length = 0  # length of " ".join(current_words), tracked incrementally

    for word in text.split():
        # Cut oversized words so that every piece fits in a chunk on its own.
        for start in range(0, len(word), max_length):
            piece = word[start:start + max_length]
            if current_words:
                needed = current_length + 1 + len(piece)  # +1 for the joining space
            else:
                needed = len(piece)  # no leading space on the first word

            if needed <= max_length:
                current_words.append(piece)
                current_length = needed
            else:
                # Only reachable with a non-empty chunk, so nothing empty is emitted.
                chunks.append(" ".join(current_words))
                current_words = [piece]
                current_length = len(piece)

    if current_words:
        chunks.append(" ".join(current_words))

    return chunks
114
+
115
@lru_cache(maxsize=4)
def _load_translator(model_name):
    """Return the translation pipeline for *model_name*, building it on first use.

    Building a pipeline loads a whole model, so the result is reused across
    calls. The cache is kept small because every entry keeps its model in
    memory; a pivot translation needs two pipelines at once.
    """
    return pipeline("translation", model=model_name)


def _translate_chunks(model_name, text):
    """Translate *text* chunk by chunk with *model_name* and join the results with spaces."""
    translator = _load_translator(model_name)
    return " ".join(
        translator(chunk)[0]["translation_text"] for chunk in chunk_text(text)
    )


def translate_text(text, source_lang, target_lang):
    """Translate *text* from *source_lang* to *target_lang*.

    A direct model from ``translation_models`` is used when one is listed for
    the pair; otherwise the text is translated to English first and from
    English to the target language.

    Args:
        text: The text to translate.
        source_lang: Language code of *text*.
        target_lang: Language code to translate into.

    Returns:
        The translated text; *text* unchanged when both languages are the
        same; ``None`` when either language is not in
        ``SUPPORTED_LANGUAGES`` or no direct or pivot model pair is listed.
    """
    if source_lang not in SUPPORTED_LANGUAGES or target_lang not in SUPPORTED_LANGUAGES:
        return None  # unsupported language

    if source_lang == target_lang:
        # Nothing to translate; avoids a lossy round trip through English.
        return text

    direct_model = translation_models.get(f"{source_lang}-{target_lang}")
    if direct_model is not None:
        return _translate_chunks(direct_model, text)

    # No direct model: pivot through English.
    to_english = translation_models.get(f"{source_lang}-en")
    from_english = translation_models.get(f"en-{target_lang}")
    if to_english is None or from_english is None:
        return None  # no model available

    english_text = _translate_chunks(to_english, text)
    # Re-chunk the intermediate text: its length differs from the source.
    return _translate_chunks(from_english, english_text)
141
+
142
@app.post("/translate")
async def translate_document(file: UploadFile = File(...), language: str = Form(...)):
    """Extract the text of an uploaded document and translate it into *language*.

    The upload is written to a temporary file, its text is extracted with
    the extractor matching the file extension, the source language is
    detected, and the text is translated with ``translate_text``.

    Args:
        file: The uploaded document (pdf, docx, pptx, xls or xlsx).
        language: Target language code.

    Returns:
        A JSON response with a ``translation`` key on success, or an
        ``error`` key with status 400 (unsupported format or language, no
        text, no model) or 500 (any unexpected exception).
    """
    extractors = {
        "pdf": extract_text_from_pdf,
        "docx": extract_text_from_docx,
        "pptx": extract_text_from_pptx,
        "xls": extract_text_from_excel,
        "xlsx": extract_text_from_excel,
    }

    try:
        # The filename may be missing on some uploads; treat that as unsupported.
        suffix = (file.filename or "").split(".")[-1].lower()

        # Reject unknown formats before anything is written to disk.
        if suffix not in extractors:
            return JSONResponse({"error": "Format non supporté"}, status_code=400)

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=f".{suffix}")
        try:
            # Close the handle before the extractor reopens the file by name.
            with temp_file:
                temp_file.write(await file.read())
            text = extractors[suffix](temp_file.name)
        finally:
            # Always remove the temporary file, even when extraction fails.
            os.remove(temp_file.name)

        if not text.strip():
            return JSONResponse({"error": "Aucun texte détecté"}, status_code=400)

        # langdetect reports regional codes such as "zh-cn"; keep only the
        # base language so it can match SUPPORTED_LANGUAGES.
        detected_lang = detect(text).split("-")[0]
        if detected_lang not in SUPPORTED_LANGUAGES:
            return JSONResponse({"error": f"Langue non supportée : {detected_lang}"}, status_code=400)

        if detected_lang == language:
            return JSONResponse({"translation": text, "note": "Déjà dans la langue choisie."})

        translated_text = translate_text(text, detected_lang, language)
        if translated_text:
            return JSONResponse({"translation": translated_text})
        return JSONResponse({"error": "Aucun modèle de traduction trouvé."}, status_code=400)

    except Exception as e:
        # Top-level boundary: report any unexpected failure as a 500.
        return JSONResponse({"error": str(e)}, status_code=500)