added emas pipeline

Browse files

Files changed (9) hide show

__init__.py +0 -0
config.json +21 -0
configuration_lang.py +18 -0
impresso_langident_wrapper.py +0 -10
lang_detect.py +29 -0
modeling_lang.py +59 -0
requirements.txt +2 -0
test.py +13 -0
test_floret.py +10 -0

__init__.py ADDED Viewed

File without changes

config.json ADDED Viewed

	@@ -0,0 +1,21 @@

+{
+  "_name_or_path": "impresso-project/impresso-langident",
+  "architectures": [
+    "LangDetectorModel"
+  ],
+  "filename": "langident-v1.0.0.bin",
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration_lang.ImpressoConfig",
+    "AutoModelForTokenClassification": "modeling_lang.LangDetectorModel"
+  },
+  "custom_pipelines": {
+    "lang-detect": {
+      "impl": "lang_detect.LangDetectionPipeline",
+      "pt": "AutoModelForTokenClassification"
+    }
+  },
+  "model_type": "lang_detect",
+  "torch_dtype": "float32",
+  "transformers_version": "4.49.0"
+}

configuration_lang.py ADDED Viewed

	@@ -0,0 +1,18 @@

+from transformers import PretrainedConfig
+import torch
+class ImpressoConfig(PretrainedConfig):
+    """Configuration object for the ``lang_detect`` model type.
+
+    Args:
+        filename: Value stored unchanged on the instance as ``filename``
+            (the model code reads it as the name of the binary to load).
+            Defaults to ``None``.
+        **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
+    """
+
+    model_type = "lang_detect"
+
+    def __init__(self, filename=None, **kwargs):
+        # The base class consumes its keyword arguments first; the
+        # model-specific attribute is attached afterwards.
+        super().__init__(**kwargs)
+        self.filename = filename
+
+
+# Register the configuration with the transformers auto classes.
+ImpressoConfig.register_for_auto_class()

impresso_langident_wrapper.py DELETED Viewed

@@ -1,10 +0,0 @@
-import floret  # Assuming Floret is already installed
-class FloretLangIdentifier:
-    """Thin wrapper around a model obtained from ``floret.load_model``."""
-
-    def __init__(self, model_path):
-        # Load once at construction; every predict() call reuses the model.
-        self.model = floret.load_model(model_path)
-
-    def predict(self, text):
-        """Return the wrapped model's ``predict`` result for *text* as-is."""
-        return self.model.predict(text)

lang_detect.py ADDED Viewed

	@@ -0,0 +1,29 @@

+from transformers import Pipeline
+class LangDetectionPipeline(Pipeline):
+    """Pipeline that maps raw text to a ``{"label", "confidence"}`` dict.
+
+    The text is handed to the model unchanged, and only the first
+    prediction of the first input is reported.
+    """
+
+    def _sanitize_parameters(self, **kwargs):
+        """Split call-time kwargs into (preprocess, forward, postprocess) dicts.
+
+        Only ``text`` is recognised, and it is routed to ``preprocess``;
+        every other keyword is dropped.
+        """
+        # NOTE(review): ``preprocess`` already takes ``text`` positionally; if
+        # the base pipeline also forwards the input positionally, a ``text=``
+        # keyword would collide with it - confirm against the Pipeline API.
+        preprocess_kwargs = {}
+        if "text" in kwargs:
+            preprocess_kwargs["text"] = kwargs["text"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, text, **kwargs):
+        """Return *text* unchanged; there is nothing to preprocess."""
+        return text
+
+    def _forward(self, text, **kwargs):
+        """Call the model on *text* and return ``(predictions, probabilities)``."""
+        predictions, probabilities = self.model(text)
+        return predictions, probabilities
+
+    def postprocess(self, outputs, **kwargs):
+        """Convert the model output into a JSON-compatible dictionary.
+
+        Args:
+            outputs: The ``(predictions, probabilities)`` pair produced by
+                ``_forward``.
+
+        Returns:
+            ``{"label": <str>, "confidence": <float>}`` where the label has
+            its ``__label__`` prefix removed and the confidence is a
+            percentage rounded to two decimals. When the model produced no
+            prediction, ``{"label": None, "confidence": 0.0}`` is returned
+            instead of raising ``IndexError``.
+        """
+        predictions, probabilities = outputs
+        # len() rather than truthiness: the containers may be array-like.
+        if len(predictions) == 0 or len(predictions[0]) == 0:
+            return {"label": None, "confidence": 0.0}
+        label = predictions[0][0].replace("__label__", "")  # Remove __label__ prefix
+        # Convert to a plain float for JSON serialization.
+        confidence = float(probabilities[0][0])
+        return {"label": label, "confidence": round(confidence * 100, 2)}

modeling_lang.py ADDED Viewed

	@@ -0,0 +1,59 @@

+import torch
+import torch.nn as nn
+from transformers import PreTrainedModel
+import logging
+import floret
+import os
+from huggingface_hub import hf_hub_download
+from .configuration_lang import ImpressoConfig
+logger = logging.getLogger(__name__)
+class LangDetectorModel(PreTrainedModel):
+    """``PreTrainedModel`` shell around a floret language-identification model.
+
+    No tensor computation happens here: ``forward`` passes raw strings to
+    the floret model loaded in ``__init__``. The single dummy parameter
+    exists only so that ``device`` has a parameter to inspect.
+    """
+
+    config_class = ImpressoConfig
+
+    def __init__(self, config):
+        """Load the floret binary named by the configuration.
+
+        Args:
+            config: Either an ``ImpressoConfig`` carrying ``filename``
+                directly, or a config whose nested ``config`` attribute
+                carries ``filename`` and ``_name_or_path``.
+
+        Raises:
+            ValueError: If no ``filename`` is configured.
+        """
+        super().__init__(config)
+        self.config = config
+        # Dummy for device checking
+        self.dummy_param = nn.Parameter(torch.zeros(1))
+        # The settings were originally read only from ``config.config``
+        # (presumably populated when the caller passes ``config=...`` to
+        # ``from_pretrained`` - TODO confirm). Fall back to ``config`` itself
+        # so a directly constructed ImpressoConfig works as well.
+        settings = getattr(config, "config", None)
+        if settings is None:
+            settings = config
+        bin_filename = settings.filename
+        if bin_filename is None:
+            raise ValueError(
+                "The configuration does not define 'filename'; cannot locate "
+                "the floret model binary."
+            )
+        # Check if the file is already present locally, else download it
+        if not os.path.exists(bin_filename):
+            bin_filename = hf_hub_download(
+                repo_id=settings._name_or_path, filename=bin_filename
+            )
+        # Load floret model using the full path
+        self.model_floret = floret.load_model(bin_filename)
+
+    def forward(self, input_ids, **kwargs):
+        """Predict the top label for one text or a list of texts.
+
+        Args:
+            input_ids: A single string or a list of strings. Despite the
+                name, these are raw texts, not token ids.
+
+        Returns:
+            The ``(predictions, probabilities)`` pair returned by the floret
+            model's ``predict`` with ``k=1``.
+
+        Raises:
+            ValueError: If ``input_ids`` is neither a string nor a list of
+                strings.
+        """
+        if isinstance(input_ids, str):
+            # If the input is a single string, make it a list for floret
+            texts = [input_ids]
+        elif isinstance(input_ids, list) and all(isinstance(t, str) for t in input_ids):
+            texts = input_ids
+        else:
+            raise ValueError(f"Unexpected input type: {type(input_ids)}")
+        # fastText-style ``predict`` handles one line at a time and rejects
+        # embedded newlines, so multi-line texts are flattened first.
+        texts = [t.replace("\n", " ") for t in texts]
+        predictions, probabilities = self.model_floret.predict(texts, k=1)
+        return (
+            predictions,
+            probabilities,
+        )
+
+    @property
+    def device(self):
+        """Device of the first registered parameter."""
+        return next(self.parameters()).device
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """Build the model from keyword arguments only.
+
+        Positional arguments are ignored and no weights are loaded here: the
+        keyword arguments are turned into an ``ImpressoConfig`` and the model
+        is constructed from it.
+        """
+        # Manually create the config
+        config = ImpressoConfig(**kwargs)
+        # Pass the manually created config to the class
+        model = cls(config)
+        return model

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ huggingface_hub
2	+ floret

test.py ADDED Viewed

	@@ -0,0 +1,13 @@

+from transformers import AutoModelForTokenClassification, AutoTokenizer
+from transformers import pipeline
+MODEL_NAME = "impresso-project/impresso-langident"
+
+# Build the custom "lang-detect" pipeline for the model repository on CPU.
+lang_pipeline = pipeline(
+    "lang-detect", model=MODEL_NAME, trust_remote_code=True, device="cpu"
+)
+
+# French sample used as a smoke-test input.
+sentence = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
+
+langs = lang_pipeline(sentence)
+# A bare ``langs`` expression is only echoed in a REPL or notebook; print it
+# so that running the file as a script actually shows the result.
+print(langs)

test_floret.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import floret
+# Load a floret binary by file name and show the type of the loaded object.
+model_floret = floret.load_model("LID-40-3-2000000-1-4.bin")
+print(type(model_floret))
+
+# French sample used as the prediction input.
+input_ids = "En l'an 1348, au plus fort des ravages de la peste noire à travers l'Europe, le Royaume de France se trouvait à la fois au bord du désespoir et face à une opportunité. À la cour du roi Philippe VI, les murs du Louvre étaient animés par les rapports sombres venus de Paris et des villes environnantes. La peste ne montrait aucun signe de répit, et le chancelier Guillaume de Nogaret, le conseiller le plus fidèle du roi, portait le lourd fardeau de gérer la survie du royaume."
+
+# The text is wrapped in a one-element list and predict() is called with k=1.
+print(model_floret.predict([input_ids], k=1))