valgardg committed on
Commit
261524f
·
1 Parent(s): c5305e2

updated demo code in readme

Browse files
Files changed (1) hide show
  1. README.md +58 -26
README.md CHANGED
@@ -77,36 +77,68 @@ https://github.com/valgardg/learnice
77
  ## Usage
78
  ## POS Tagging an Icelandic Sentence
79
  Here is an example of how to use the model to tag Icelandic sentences:
80
-
81
- from transformers import AutoModelForTokenClassification, AutoTokenizer
82
- import torch
 
83
  import json
84
 
85
- # Load the model and tokenizer
86
- model = AutoModelForTokenClassification.from_pretrained("<local_model_path>")
87
- tokenizer = AutoTokenizer.from_pretrained("<local_model_path>")
88
-
89
- # Load the ID-to-Tag mapping
90
- with open("id2tag_ftbi_ds100.json", "r") as f:
91
  id2tag = json.load(f)
92
 
93
- # Input Icelandic sentence
94
- sentence = "Hér er dæmasetning til að prófa."
95
-
96
- # Tokenize the sentence
97
- inputs = tokenizer(sentence, return_tensors="pt", truncation=True, is_split_into_words=False)
98
-
99
- # Get predictions
100
- outputs = model(**inputs).logits
101
- predictions = torch.argmax(outputs, dim=2).squeeze().tolist()
102
-
103
- # Map predictions to tags
104
- tags = [id2tag[str(pred)] for pred in predictions]
105
-
106
- # Combine tokens with tags
107
- tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().tolist())
108
- for token, tag in zip(tokens, tags):
109
- print(f"{token}: {tag}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  ## License
112
  MIT License
 
77
  ## Usage
78
  ## POS Tagging an Icelandic Sentence
79
  Here is an example of how to use the model to tag Icelandic sentences:
80
+ ```python
81
+ # Load the fine-tuned model
82
+ from transformers import BertTokenizerFast, BertForTokenClassification
83
+ import torch # type: ignore
84
  import json
85
 
86
+ # Load id2tag mapping
87
+ with open("../models/ftbi_ds100/id2tag_ftbi_ds100.json", "r") as f:
 
 
 
 
88
  id2tag = json.load(f)
89
 
90
+ # Load your tokenizer and model from saved checkpoint
91
+ tokenizer = BertTokenizerFast.from_pretrained("../models/ftbi_ds100")
92
+ model = BertForTokenClassification.from_pretrained("../models/ftbi_ds100")
93
+
94
+ # Function to predict tags on a new sentence
95
+ def predict_tags(sentence, tokenizer, model, id2tag):
96
+ # Tokenize the sentence
97
+ tokenized_input = tokenizer(sentence, is_split_into_words=True, return_tensors="pt")
98
+
99
+ # Get predictions
100
+ with torch.no_grad():
101
+ output = model(**tokenized_input)
102
+
103
+ # Get predicted label IDs
104
+ label_ids = torch.argmax(output.logits, dim=2).squeeze().tolist()
105
+
106
+ # Convert label IDs to tag names
107
+ tags = [id2tag[str(label_id)] if str(label_id) in id2tag else 'O' for label_id in label_ids]
108
+
109
+ # Match back to original words
110
+ word_ids = tokenized_input.word_ids() # This shows which original word each token corresponds to
111
+ word_tags = []
112
+ current_word_id = None
113
+ current_tags = []
114
+
115
+ # Aggregate tags for each word
116
+ for word_id, tag in zip(word_ids, tags):
117
+ if word_id is None: # Skip special tokens
118
+ continue
119
+ if word_id != current_word_id: # New word detected
120
+ if current_tags: # Append the aggregated tag for the previous word
121
+ word_tags.append(current_tags[0]) # Use the first tag, or customize this
122
+ current_word_id = word_id
123
+ current_tags = [tag]
124
+ else:
125
+ current_tags.append(tag) # Aggregate tags for the same word
126
+
127
+ # Append the last word's tag
128
+ if current_tags:
129
+ word_tags.append(current_tags[0]) # Use the first tag, or customize this
130
+
131
+ # Return the original words and their aggregated tags
132
+ return list(zip(sentence, word_tags))
133
+
134
+ # Example usage with a new Icelandic sentence
135
+ sentence = ["Hraunbær", "105", "."]
136
+ sentence = ["Niðurstaða", "þess", "var", "neikvæð", "."]
137
+ sentence = "Kl. 9-16 fótaaðgerðir og hárgreiðsla , Kl. 9.15 handavinna , Kl. 13.30 sungið við flygilinn , Kl. 14.30-16 dansað við lagaval Halldóru , kaffiveitingar allir velkomnir .".split()
138
+ predicted_tags = predict_tags(sentence, tokenizer, model, id2tag)
139
+
140
+ print("Predicted Tags:", predicted_tags)
141
+ ```
142
 
143
  ## License
144
  MIT License