---
language: he
thumbnail: https://avatars1.githubusercontent.com/u/3617152?norod.jpg
widget:
- text: "<|startoftext|>החוק השני של מועדון קרב הוא"
- text: "<|startoftext|>ראש הממשלה בן גוריון"
- text: "<|startoftext|>למידת מכונה (סרט)"
- text: "<|startoftext|>מנשה פומפרניקל"
- text: "<|startoftext|>אי שוויון "
license: mit
---

# hewiki-articles-distilGPT2py-il

## A tiny GPT2 model for generating Hebrew text

A distilGPT2-sized model. <br>
Training data was hewiki-20200701-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/hewiki/20200701/ <br>
The XML was converted to plain text using Wikipedia Extractor http://medialab.di.unipi.it/wiki/Wikipedia_Extractor <br>
I then added <|startoftext|> and <|endoftext|> markers and deleted empty lines. <br>
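
The marker step can be sketched as follows. This is a hypothetical reconstruction of the preprocessing described above, not the author's actual script; the input and output file names are placeholders.

```python
# Hypothetical sketch: wrap each non-empty extracted line in the start/end
# markers. "wiki_extracted.txt" and "hewiki_train.txt" are placeholder names.
with open("wiki_extracted.txt", encoding="utf-8") as src, \
     open("hewiki_train.txt", "w", encoding="utf-8") as dst:
    for line in src:
        line = line.strip()
        if not line:
            continue  # delete empty lines
        dst.write("<|startoftext|>" + line + "<|endoftext|>\n")
```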

#### How to use

```python
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("Norod78/hewiki-articles-distilGPT2py-il")
model = GPT2LMHeadModel.from_pretrained("Norod78/hewiki-articles-distilGPT2py-il").eval()

bos_token = tokenizer.bos_token  # Beginning-of-sentence token
eos_token = tokenizer.eos_token  # End-of-sentence token

def generate_word(model, tokens_tensor, temperature=1.0):
    """
    Sample the next word from the model, given a tensor of tokens for the
    preceding words. Temperature controls randomness: with temperature == 0
    we simply take the greedy argmax; otherwise we sample from a multinomial
    distribution over the temperature-scaled logits, which adds randomness
    and helps escape repetitions.
    """
    with torch.no_grad():
        outputs = model(tokens_tensor)
        predictions = outputs[0]
        if temperature > 0:
            # Make the distribution more or less skewed based on the temperature
            predictions = predictions / temperature
            # Sample from the distribution
            softmax = nn.Softmax(dim=0)
            predicted_index = torch.multinomial(softmax(predictions[0, -1, :]), 1).item()
        else:
            # Simply take the argmax of the distribution
            predicted_index = torch.argmax(predictions[0, -1, :]).item()
    # Decode the predicted token id back to text
    predicted_text = tokenizer.decode([predicted_index])
    return predicted_text

def generate_sentence(model, tokenizer, initial_text, temperature=1.0):
    """Generate a sentence given some initial text using a model and a tokenizer.
    Returns a (text, done) tuple, where done indicates whether an
    end-of-text marker was reached."""

    text = ""

    # We avoid an infinite loop by setting a maximum range
    for i in range(84):
        # Encode the text input
        indexed_tokens = tokenizer.encode(initial_text + text)

        # Convert the indexed tokens into a PyTorch tensor
        tokens_tensor = torch.tensor([indexed_tokens])

        new_word = generate_word(model, tokens_tensor, temperature=temperature)

        # The temperature is slowly raised with each generated word and capped
        # just below 1.0, so some randomness always remains while the sentence
        # (ending) stays coherent.
        if temperature < (1 - 0.008):
            temperature += 0.008
        else:
            temperature = 0.996

        text = text + new_word

        # Stop generating new words once an end-of-text marker appears
        if eos_token in new_word:
            # Return the new sentence and signal that generation is done
            return (text.replace(eos_token, "").strip(), True)
        elif '/' in new_word:
            # Treat a '/' in the generated piece as a soft stop
            return (text.strip(), False)
        elif bos_token in new_word:
            return (text.replace(bos_token, "").strip(), False)

    return (text, True)

for output_num in range(1, 5):
    init_text = "בוקר טוב"  # "Good morning"
    text = bos_token + init_text
    for i in range(84):
        sentence = generate_sentence(model, tokenizer, text, temperature=0.9)
        text = init_text + sentence[0]
        print(text)
        if sentence[1]:
            break
```
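
If you only need samples and not the word-by-word loop above, the stock `generate` API of `transformers` should also work with this model. A minimal sketch; the generation parameters (`max_length`, `temperature`, `top_k`) are illustrative, not values tuned by the author:

```python
prompt = "<|startoftext|>בוקר טוב"  # "Good morning"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Sample one continuation; all generation parameters here are illustrative
sample_output = model.generate(
    input_ids,
    do_sample=True,
    max_length=128,
    temperature=0.9,
    top_k=50,
    eos_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))
```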