langident
custom_code
impresso-langident / langident.py
Simon Clematide
Fix confidence score rounding in model output for JSON compatibility
996ea64
raw
history blame
970 Bytes
from transformers import Pipeline
class LangDetectionPipeline(Pipeline):
    """Pipeline that reports the top predicted language for a text.

    The wrapped ``self.model`` is called with the newline-free text and is
    expected to return a ``(predictions, probabilities)`` pair; the result is
    reduced to a ``{"language": ..., "score": ...}`` dictionary.
    """

    def _sanitize_parameters(self, **kwargs):
        """Split call-time keyword arguments across the pipeline stages.

        Returns a 3-tuple of dicts: the first carries ``text`` when it was
        supplied, the other two are always empty.
        """
        # NOTE(review): `preprocess` already takes `text` as its own
        # parameter, so forwarding it again as a keyword may collide --
        # confirm against the way the base Pipeline invokes `preprocess`.
        preprocess_params = {
            key: kwargs[key] for key in ("text",) if key in kwargs
        }
        return preprocess_params, {}, {}

    def preprocess(self, text, **kwargs):
        """Return ``text`` unchanged; extra keyword arguments are ignored."""
        return text

    def _forward(self, text, **kwargs):
        """Run the model on ``text`` with newlines replaced by spaces.

        Returns the ``(predictions, probabilities)`` pair produced by
        ``self.model``.
        """
        # Presumably the underlying model expects single-line input -- confirm.
        single_line = text.replace("\n", " ")
        labels, scores = self.model(single_line)
        return labels, scores

    def postprocess(self, outputs, **kwargs):
        """Turn the raw model output into a JSON-compatible dictionary.

        Only the first label and first probability of the first entry are
        used. The ``__label__`` marker is stripped from the label and the
        probability is rounded to two decimals.
        """
        labels, scores = outputs
        language = labels[0][0].replace("__label__", "")
        # Cast to a built-in float so the value is JSON serializable.
        score = float(scores[0][0])
        return {"language": language, "score": round(score, 2)}