Doron Adler committed
Commit 22f0fab · 1 Parent(s): b544395

Silly Ted-Talk Snippet Generator

.gitattributes CHANGED
@@ -29,3 +29,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model/pytorch_model.bin filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-title: SillyTedTalkSnippetGenerator
-emoji: ⚡
+title: Silly Ted-Talk Snippet Generator
+emoji: 🧑‍🏫
 colorFrom: green
 colorTo: pink
 sdk: gradio
app.py ADDED
@@ -0,0 +1,46 @@
+import gradio as gr
+from transformers import pipeline
+
+title = "Silly Ted-Talk snippet generator"
+description = "Tap on the \"Submit\" button to generate a random text snippet."
+article = "<p>Fine tuned <a href=\"https://huggingface.co/EleutherAI/gpt-neo-125M\">EleutherAI/gpt-neo-125M</a> upon a formatted <a href=\"https://www.kaggle.com/datasets/miguelcorraljr/ted-ultimate-dataset\"> TED – Ultimate Dataset</a> (English)</p>"
+
+model_id = "./model"
+text_generator = pipeline('text-generation', model=model_id, tokenizer=model_id)
+max_length = 128
+top_k = 40
+top_p = 0.92
+temperature = 1.0
+
+def text_generation(input_text = None):
+    if input_text == None or len(input_text) == 0:
+        input_text = "\t\""
+    else:
+        input_text.replace("\"", "")
+        if input_text.startswith("<|startoftext|>") == False:
+            input_text = "\t\"" + input_text
+    generated_text = text_generator(input_text,
+                                    max_length=max_length,
+                                    top_k=top_k,
+                                    top_p=top_p,
+                                    temperature=temperature,
+                                    do_sample=True,
+                                    repetition_penalty=2.0,
+                                    bos_token='<|startoftext|>',
+                                    eos_token='<|endoftext|>',
+                                    pad_token='<|pad|>',
+                                    unknown_token = '<|unknown|>',
+                                    num_return_sequences=1)
+    parsed_text = generated_text[0]["generated_text"].replace("<|startoftext|>", "").replace("\r","").replace("\n\n", "\n").replace("\t", " ").replace("<|pad|>", " * ").replace("\"\"", "\"")
+    return parsed_text
+
+gr.Interface(
+    text_generation,
+    [gr.inputs.Textbox(lines=1, label="Enter input text or leave blank")],
+    outputs=[gr.outputs.Textbox(type="auto", label="Generated Ted-Talk snippet")],
+    title=title,
+    description=description,
+    article=article,
+    theme="default",
+    allow_flagging=False,
+).launch()
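For reference, the same generation can be reproduced outside the Gradio UI. This is a minimal sketch, assuming the ./model directory from this commit is available locally; it reuses the sampling settings from app.py but passes only standard generation arguments:

from transformers import pipeline

# Load the fine-tuned GPT-Neo 125M checkpoint committed under ./model
generator = pipeline("text-generation", model="./model", tokenizer="./model")

# app.py seeds empty input with a tab followed by an opening quote
prompt = "\t\""
result = generator(
    prompt,
    max_length=128,
    do_sample=True,
    top_k=40,
    top_p=0.92,
    temperature=1.0,
    repetition_penalty=2.0,
    num_return_sequences=1,
)
print(result[0]["generated_text"])
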
model/added_tokens.json ADDED
@@ -0,0 +1,4 @@
+{
+  "<|pad|>": 50258,
+  "<|startoftext|>": 50257
+}
model/config.json ADDED
@@ -0,0 +1,54 @@
+{
+  "_name_or_path": "EleutherAI/gpt-neo-125M",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPTNeoForCausalLM"
+  ],
+  "attention_dropout": 0,
+  "attention_layers": [
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local",
+    "global",
+    "local"
+  ],
+  "attention_types": [
+    [
+      [
+        "global",
+        "local"
+      ],
+      6
+    ]
+  ],
+  "bos_token_id": 50256,
+  "embed_dropout": 0,
+  "eos_token_id": 50256,
+  "gradient_checkpointing": false,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "layer_norm_epsilon": 1e-05,
+  "max_position_embeddings": 2048,
+  "model_type": "gpt_neo",
+  "num_heads": 12,
+  "num_layers": 12,
+  "resid_dropout": 0,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.21.0",
+  "use_cache": true,
+  "vocab_size": 50259,
+  "window_size": 256
+}
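The vocab_size of 50259 is the base 50257-token GPT-2 vocabulary plus the two entries in model/added_tokens.json (<|startoftext|> = 50257, <|pad|> = 50258). A hedged sketch of how a checkpoint like this is typically prepared, not necessarily the exact training script used for this commit:

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
# Register the extra special tokens recorded in added_tokens.json / special_tokens_map.json
tokenizer.add_special_tokens({"bos_token": "<|startoftext|>", "pad_token": "<|pad|>"})

model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-125M")
# Grow the embedding matrix to match the enlarged vocabulary (50257 + 2 = 50259)
model.resize_token_embeddings(len(tokenizer))
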
model/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a20a93d0bdf419bf744e508c8e75411812e896d1e43f3e57003f958c2867d7c
+size 551191249
model/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,42 @@
+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 2048,
+  "name_or_path": "EleutherAI/gpt-neo-125M",
+  "pad_token": {
+    "__type": "AddedToken",
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "special_tokens_map_file": null,
+  "tokenizer_class": "GPT2Tokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unknown_token": "<|unknown|>"
+}
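As a quick sanity check, the special tokens configured above can be verified against the ids in model/added_tokens.json once the tokenizer is loaded from the committed directory (an illustrative snippet, not part of the commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./model")
print(tokenizer.bos_token, tokenizer.bos_token_id)  # <|startoftext|> 50257
print(tokenizer.eos_token, tokenizer.eos_token_id)  # <|endoftext|> 50256
print(tokenizer.pad_token, tokenizer.pad_token_id)  # <|pad|> 50258
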
model/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
+gradio
+transformers
+tokenizers
+