Gleb Vinarskis
committed on
Commit
·
ae8276c
1
Parent(s):
8953dd4
added emas pipeline
Browse files
- __init__.py +0 -0
- config.json +21 -0
- configuration_lang.py +18 -0
- impresso_langident_wrapper.py +0 -10
- lang_detect.py +29 -0
- modeling_lang.py +59 -0
- requirements.txt +2 -0
- test.py +13 -0
- test_floret.py +10 -0
__init__.py
ADDED
File without changes
|
config.json
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "impresso-project/impresso-langident",
|
3 |
+
"architectures": [
|
4 |
+
"LangDetectorModel"
|
5 |
+
],
|
6 |
+
"filename": "langident-v1.0.0.bin",
|
7 |
+
"attention_probs_dropout_prob": 0.1,
|
8 |
+
"auto_map": {
|
9 |
+
"AutoConfig": "configuration_lang.ImpressoConfig",
|
10 |
+
"AutoModelForTokenClassification": "modeling_lang.LangDetectorModel"
|
11 |
+
},
|
12 |
+
"custom_pipelines": {
|
13 |
+
"lang-detect": {
|
14 |
+
"impl": "lang_detect.LangDetectionPipeline",
|
15 |
+
"pt": "AutoModelForTokenClassification"
|
16 |
+
}
|
17 |
+
},
|
18 |
+
"model_type": "lang_detect",
|
19 |
+
"torch_dtype": "float32",
|
20 |
+
"transformers_version": "4.49.0"
|
21 |
+
}
|
configuration_lang.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import PretrainedConfig
|
2 |
+
import torch
|
3 |
+
|
4 |
+
|
5 |
+
class ImpressoConfig(PretrainedConfig):
    """Configuration for the ``lang_detect`` model type.

    Args:
        filename: Name of the floret binary associated with this
            configuration (``None`` when not provided).
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "lang_detect"

    def __init__(self, filename=None, **kwargs):
        super().__init__(**kwargs)
        self.filename = filename


# Make the configuration class discoverable through the transformers
# auto-class machinery.
ImpressoConfig.register_for_auto_class()
|
impresso_langident_wrapper.py
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
import floret # Assuming Floret is already installed
|
2 |
-
|
3 |
-
|
4 |
-
class FloretLangIdentifier:
    """Thin wrapper that loads a floret model and exposes its ``predict``."""

    def __init__(self, model_path):
        # Load the floret model once; every predict() call reuses it.
        self.model = floret.load_model(model_path)

    def predict(self, text):
        """Return whatever ``floret``'s ``predict`` yields for ``text``."""
        return self.model.predict(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
lang_detect.py
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import Pipeline
|
2 |
+
|
3 |
+
|
4 |
+
class LangDetectionPipeline(Pipeline):
    """Custom pipeline that feeds raw text to the model and reports the top language.

    The model is expected to return a ``(predictions, probabilities)`` pair;
    only the first entry of the first item is reported.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call-time kwargs into (preprocess, forward, postprocess) dicts.

        Only a ``text`` keyword is forwarded, and only to ``preprocess``.
        """
        # NOTE(review): ``preprocess`` already receives the input as its
        # positional ``text`` parameter, so forwarding a ``text`` keyword
        # looks like it would collide with it -- confirm intended usage.
        preprocess_params = {"text": kwargs["text"]} if "text" in kwargs else {}
        return preprocess_params, {}, {}

    def preprocess(self, text, **kwargs):
        """Return the input untouched; the model consumes raw text."""
        return text

    def _forward(self, text, **kwargs):
        """Run the model and pass its two outputs through as a tuple."""
        labels, scores = self.model(text)
        return labels, scores

    def postprocess(self, outputs, **kwargs):
        """Turn the model outputs into a JSON-compatible dictionary.

        Returns:
            ``{"label": <label without the "__label__" prefix>,
            "confidence": <top probability as a percentage, 2 decimals>}``.
        """
        labels, scores = outputs
        top_label = labels[0][0].replace("__label__", "")
        # Cast to a plain float so the value is JSON serializable.
        top_score = float(scores[0][0])
        return {"label": top_label, "confidence": round(top_score * 100, 2)}
|
modeling_lang.py
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
from transformers import PreTrainedModel
|
4 |
+
import logging
|
5 |
+
import floret
|
6 |
+
import os
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
+
from .configuration_lang import ImpressoConfig
|
9 |
+
|
10 |
+
logger = logging.getLogger(__name__)
|
11 |
+
|
12 |
+
|
13 |
+
class LangDetectorModel(PreTrainedModel):
    """``PreTrainedModel`` wrapper around a floret language-identification model.

    The model holds no trainable weights of its own; predictions come from a
    floret binary that is read from disk or downloaded from the Hugging Face
    hub.
    """

    config_class = ImpressoConfig

    def __init__(self, config):
        """Load the floret binary named by ``config``.

        Args:
            config: Either an ``ImpressoConfig`` carrying ``filename`` directly,
                or the wrapper built by ``from_pretrained`` below, which stores
                the loaded configuration under its ``config`` attribute.

        Raises:
            ValueError: If no floret filename is configured.
        """
        super().__init__(config)
        self.config = config

        # Dummy for device checking
        self.dummy_param = nn.Parameter(torch.zeros(1))

        # from_pretrained() builds ImpressoConfig(**kwargs), so the loaded
        # configuration ends up nested as ``config.config``. Fall back to the
        # object itself so direct construction from a flat config also works.
        settings = getattr(config, "config", config)
        bin_filename = getattr(settings, "filename", None)
        if bin_filename is None:
            raise ValueError(
                "The configuration does not define `filename` (floret model binary)."
            )

        # Check if the file is already present locally, else download it
        if not os.path.exists(bin_filename):
            bin_filename = hf_hub_download(
                repo_id=settings._name_or_path, filename=bin_filename
            )

        # Load floret model using the full path
        self.model_floret = floret.load_model(bin_filename)

    def forward(self, input_ids, **kwargs):
        """Predict the top label for one text or a list of texts.

        Args:
            input_ids: A single string or a list of strings (raw text, not
                token ids despite the parameter name).
            **kwargs: Accepted for interface compatibility; unused.

        Returns:
            The ``(predictions, probabilities)`` pair returned by floret's
            ``predict`` with ``k=1``.

        Raises:
            ValueError: If ``input_ids`` is neither a string nor a list of
                strings.
        """
        if isinstance(input_ids, str):
            # If the input is a single string, make it a list for floret
            texts = [input_ids]
        elif isinstance(input_ids, list) and all(isinstance(t, str) for t in input_ids):
            texts = input_ids
        else:
            raise ValueError(f"Unexpected input type: {type(input_ids)}")

        # floret's predict (inherited from fastText) processes one line at a
        # time and rejects text containing newlines, so flatten them first.
        texts = [t.replace("\n", " ") for t in texts]

        predictions, probabilities = self.model_floret.predict(texts, k=1)
        return predictions, probabilities

    @property
    def device(self):
        """Device of the first registered parameter (the dummy parameter)."""
        return next(self.parameters()).device

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Build the model without loading any transformer weights.

        Positional arguments are ignored. All keyword arguments are handed to
        ``ImpressoConfig``, which is then used to instantiate the model.
        """
        # Manually create the config
        config = ImpressoConfig(**kwargs)
        # Pass the manually created config to the class
        return cls(config)
|
requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
huggingface_hub
|
2 |
+
floret
|
test.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import AutoModelForTokenClassification, AutoTokenizer
|
2 |
+
from transformers import pipeline
|
3 |
+
|
4 |
+
# Hub repository providing the model and the custom "lang-detect" pipeline.
MODEL_NAME = "impresso-project/impresso-langident"

# trust_remote_code=True lets transformers load the pipeline implementation
# shipped with the model repository.
lang_pipeline = pipeline(
    "lang-detect", model=MODEL_NAME, trust_remote_code=True, device="cpu"
)

sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."

langs = lang_pipeline(sentence)
# A bare expression is only echoed in a REPL/notebook; print it so the result
# is visible when this file is run as a script.
print(langs)
|
test_floret.py
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import floret
|
2 |
+
|
3 |
+
# Smoke test: load a local floret binary and print its top-1 prediction.
model_floret = floret.load_model("LID-40-3-2000000-1-4.bin")
print(type(model_floret))

input_ids = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."

# predict() is given a list of texts; k=1 keeps only the best label per text.
print(model_floret.predict([input_ids], k=1))
|
10 |
+
|