import json
import os
import sys
import urllib
import urllib.parse
import urllib.request

import numpy as np
import streamlit as st
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
def generate(tokenizer, model, text, features):
    """Sample comments for a video title and render each one via Streamlit.

    Args:
        tokenizer: GPT2Tokenizer with the custom special tokens registered.
        model: GPT2LMHeadModel fine-tuned on the comment format.
        text: video title inserted into the prompt template.
        features: dict with keys 'num' (comments wanted), 'max_length',
            'top_p' and 't' (temperature * 100 — divided by 100 below).
    """
    prompt = "<|startoftext|><|titlestart|>{}<|titleend|><|authornamebegin|>".format(text)
    generated = tokenizer(prompt, return_tensors="pt").input_ids
    count = 0
    # Cap the number of sampling attempts so a model that never produces the
    # expected author markers cannot make this loop spin forever.
    attempts = 0
    max_attempts = max(1, features['num']) * 20
    while count < features['num'] and attempts < max_attempts:
        attempts += 1
        sample_outputs = model.generate(
            generated, do_sample=True, top_k=50,
            max_length=features['max_length'], top_p=features['top_p'],
            temperature=features['t'] / 100.0, num_return_sequences=1,
        )
        # Keep special tokens: they delimit the author name and comment body.
        decoded = tokenizer.decode(sample_outputs[0], skip_special_tokens=False)
        print(decoded, file=sys.stderr)
        if '<|authornamebegin|>' not in decoded:
            continue
        raw = decoded.split('<|authornamebegin|>')[-1]
        if '<|authornameend|>' not in raw:
            continue
        parts = raw.split('<|authornameend|>')
        author = parts[-2]
        # Do not clobber the `text` parameter: keep the comment body separate.
        comment = parts[-1]
        count += 1
        st.markdown('**' + author.strip() + '**: '
                    + comment.replace('<|endoftext|>', '').replace('<|pad|>', '').strip())
def load_model():
    """Build the tokenizer and load the fine-tuned GPT-2 model from disk.

    Reads './config.json' and './pytorch_model.bin' relative to the working
    directory and returns a ``(tokenizer, model)`` pair ready for inference.
    """
    additional_special_tokens = ['<|titlestart|>', '<|titleend|>',
                                 '<|authornamebegin|>', '<|authornameend|>']
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium', bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>', pad_token='<|pad|>',
                                              additional_special_tokens=additional_special_tokens)
    config = GPT2Config.from_json_file('./config.json')
    model = GPT2LMHeadModel(config)
    # NOTE: torch.load unpickles the checkpoint — only load trusted files.
    state_dict = torch.load('./pytorch_model.bin', map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    # Switch to inference mode; otherwise dropout stays active and distorts
    # the sampling done in generate().
    model.eval()
    return tokenizer, model
def main():
    """Streamlit entry point: collect sampling settings and a YouTube URL,
    resolve the video title via the oEmbed API, and generate comments."""
    tokenizer, model = load_model()
    st.title("YouTube comments generating project")
    st.header('YouTube comments generator')
    st.sidebar.title("Features")
    # User-controlled sampling parameters consumed by generate().
    features = {
        "num": st.sidebar.slider("Количество комментариев", 0, 20, 1, 1),
        "t": st.sidebar.slider("Температура", 0, 300, 180, 1),
        "top_p": st.sidebar.slider("Top-p", 0, 100, 95, 5),
        "max_length": st.sidebar.slider("Максимальная длина комментария", 0, 300, 100, 5),
    }
    st.sidebar.title("Note")
    st.sidebar.write(
        """
        Изменяя значения, можно получить различные выводы модели
        """
    )
    st.sidebar.write(
        """
        Значение температуры делится на 100
        """
    )
    st.sidebar.caption(f"Streamlit version `{st.__version__}`")
    with st.form(key='my_form'):
        url = st.text_input('Введите url видео на YouTube')
        st.form_submit_button('Готово!')
    if url:
        # Resolve the video title through YouTube's public oEmbed endpoint.
        params = {"format": "json", "url": url}
        query_string = urllib.parse.urlencode(params)
        oembed_url = "https://www.youtube.com/oembed" + "?" + query_string
        try:
            with urllib.request.urlopen(oembed_url) as response:
                data = json.loads(response.read().decode())
        except (urllib.error.URLError, ValueError):
            # Bad/unknown URL or malformed response: report instead of crashing.
            st.error('Не удалось получить данные о видео — проверьте URL')
            return
        st.write('Video Title: ' + data['title'])
        st.video(url)
        generate(tokenizer, model, data['title'], features)
if __name__ == "__main__":
    main()