Custom Code for Playing with the Model
#3 by Svngoku · opened
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span
from spacy.util import filter_spans
from transformers import pipeline

# Load the model
pipe = pipeline("token-classification", model="masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
# Custom functions
def get_label(entity_label):
    # Return the label without the "B-" or "I-" prefix
    if entity_label.startswith(('B-', 'I-')):
        return entity_label[2:]
    return entity_label
def mskner_entites(text):
    mskner = pipe(text)
    # Create a blank spaCy pipeline ("en" is only used to get an empty vocab)
    nlp = spacy.blank("en")
    words = text.split()  # Simple whitespace split; adjust as needed for your tokenization
    doc = Doc(nlp.vocab, words=words)
    # Collect the entity spans
    spans = []
    # Iterate over detected entities to create spans
    for item in mskner:
        # Strip the B-/I- prefix from the transformers label
        entity_type = get_label(item['entity'])
        # Transformers provides character offsets; find the corresponding tokens in the spaCy Doc
        start_char = item['start']
        end_char = item['end']
        start_token = None
        end_token = None
        for token in doc:
            # Token start matches the entity start
            if start_char == token.idx:
                start_token = token.i
            # Token end covers the entity end (end_char is exclusive)
            if end_char <= token.idx + len(token):
                end_token = token.i + 1
                break  # Stop once the end token is found
        # Create a span for this entity if valid token indices were found
        if start_token is not None and end_token is not None:
            spans.append(Span(doc, start_token, end_token, label=entity_type.upper()))
    # Drop overlapping/duplicate spans (sub-word pieces of the same word) before setting the entities
    doc.ents = filter_spans(spans)
    # Visualize with displaCy
    displacy.render(doc, style="ent", jupyter=True)
# Output from the custom function
mskner_entites("Gazeti la Jumapili 21 linasema mvua imenyesha Nairobi na rais atahudhuria kwa ajili ya familia")