Custom Code for Playing with the Model
#3 by Svngoku · opened
import spacy
from spacy import displacy
from spacy.tokens import Doc, Span
from spacy.util import filter_spans
from transformers import pipeline

# Load the model
pipe = pipeline("token-classification", model="masakhane/afroxlmr-large-ner-masakhaner-1.0_2.0")
# Custom functions
def get_label(entity_label):
    # Return the label without the "B-" or "I-" prefix
    if entity_label.startswith(('B-', 'I-')):
        return entity_label[2:]
    return entity_label
def mskner_entites(text):
    mskner = pipe(text)
    # Create a blank spaCy pipeline ("en" is only used to get an empty vocab)
    nlp = spacy.blank("en")
    words = text.split()  # Simple whitespace split; adjust as needed for your tokenization
    doc = Doc(nlp.vocab, words=words)
    # Collect the entity spans
    spans = []
    # Iterate over detected entities to create spans
    for item in mskner:
        # Strip the B-/I- prefix from the transformers label
        entity_type = get_label(item['entity'])
        # Transformers provides character offsets; find the corresponding tokens in the spaCy Doc
        start_char = item['start']
        end_char = item['end']
        start_token = None
        end_token = None
        for token in doc:
            # Token start matches the entity start
            if start_char == token.idx:
                start_token = token.i
            # Token end covers the entity end (end_char is exclusive)
            if end_char <= token.idx + len(token):
                end_token = token.i + 1
                break  # Stop once the end token is found
        # Create a span for this entity if valid token indices were found
        if start_token is not None and end_token is not None:
            spans.append(Span(doc, start_token, end_token, label=entity_type.upper()))
    # Drop overlapping/duplicate spans (sub-word pieces of the same word) before setting the entities
    doc.ents = filter_spans(spans)
    # Visualize with displaCy
    displacy.render(doc, style="ent", jupyter=True)
# Output from the custom function
mskner_entites("Gazeti la Jumapili 21 linasema mvua imenyesha Nairobi na rais atahudhuria kwa ajili ya familia")