"""Streamlit app that generates YouTube-style comments with a fine-tuned GPT-2.

The user pastes a YouTube URL; the video title is fetched through the YouTube
oEmbed endpoint and used as the prompt for the language model. Generated
samples are parsed into an author name and a comment body and rendered with
Streamlit.
"""
import json
import os
import sys
import urllib
import urllib.parse
import urllib.request

import numpy as np
import streamlit as st
import torch
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

# Special tokens that delimit the prompt and the generated author name.
AUTHOR_BEGIN = "<|authornamebegin|>"
AUTHOR_END = "<|authornameend|>"

OEMBED_URL = "https://www.youtube.com/oembed"

# Sampling requires a strictly positive temperature, while the slider starts
# at 0; 0.01 is the smallest non-zero value the slider can produce (1 / 100).
MIN_TEMPERATURE = 0.01

# Default retry budget: how many samples may be drawn per requested comment
# before giving up on malformed model output.
MAX_ATTEMPTS_PER_COMMENT = 5


def _cache_resource(func):
    """Return *func* memoized across Streamlit reruns when supported.

    Streamlit re-executes the script on every widget interaction, so without
    caching the model weights would be reloaded each time. Older Streamlit
    releases without ``st.cache_resource`` get the undecorated function.
    """
    decorator = getattr(st, "cache_resource", None)
    return func if decorator is None else decorator(func)


def _parse_comment(decoded):
    """Extract ``(author, comment)`` from decoded model output.

    Args:
        decoded: Text decoded with special tokens kept.

    Returns:
        A tuple of stripped author name and comment text, or ``None`` when
        either author marker is missing from the sample.
    """
    if AUTHOR_BEGIN not in decoded:
        return None
    # Only the text after the last author-begin marker belongs to the sample.
    raw = decoded.split(AUTHOR_BEGIN)[-1]
    if AUTHOR_END not in raw:
        return None
    pieces = raw.split(AUTHOR_END)
    author, comment = pieces[-2], pieces[-1]
    comment = comment.replace('<|endoftext|>', '').replace('<|pad|>', '')
    return author.strip(), comment.strip()


def generate(tokenizer, model, text, features, max_attempts=None):
    """Sample comments for a video title and render them with Streamlit.

    Args:
        tokenizer: Tokenizer used to encode the prompt and decode samples.
        model: Language model exposing ``generate``.
        text: Video title inserted into the prompt.
        features: Mapping with the slider values ``num`` (comments wanted),
            ``t`` (temperature * 100), ``top_p`` (top-p * 100) and
            ``max_length`` (passed to ``model.generate`` unchanged).
        max_attempts: Upper bound on the number of samples drawn. Defaults to
            ``MAX_ATTEMPTS_PER_COMMENT`` per requested comment.
    """
    prompt = "<|startoftext|><|titlestart|>{}<|titleend|><|authornamebegin|>".format(text)
    generated = tokenizer(prompt, return_tensors="pt").input_ids

    wanted = features['num']
    if max_attempts is None:
        max_attempts = wanted * MAX_ATTEMPTS_PER_COMMENT

    # Both sliders hold percentages; the sampler expects fractions.
    temperature = max(features['t'] / 100.0, MIN_TEMPERATURE)
    top_p = features['top_p'] / 100.0

    produced = 0
    attempts = 0
    # Samples lacking the author markers are discarded; the attempt budget
    # keeps a model that never emits them from looping forever.
    while produced < wanted and attempts < max_attempts:
        attempts += 1
        sample_outputs = model.generate(
            generated,
            do_sample=True,
            top_k=50,
            # NOTE(review): max_length is forwarded as-is; confirm whether the
            # slider is meant to include the prompt tokens.
            max_length=features['max_length'],
            top_p=top_p,
            temperature=temperature,
            num_return_sequences=1,
        )
        decoded = tokenizer.decode(sample_outputs[0], skip_special_tokens=False)
        print(decoded, file=sys.stderr)

        parsed = _parse_comment(decoded)
        if parsed is None:
            continue
        author, comment = parsed
        produced += 1
        st.markdown('**' + author + '**: ' + comment)

    if produced < wanted:
        st.warning(
            f"Удалось сгенерировать только {produced} из {wanted} комментариев: "
            "модель не вернула маркеры автора."
        )


@_cache_resource
def load_model():
    """Load the tokenizer and the fine-tuned GPT-2 weights from disk.

    Reads ``./config.json`` and ``./pytorch_model.bin`` relative to the
    working directory and maps the weights onto the CPU.

    Returns:
        A ``(tokenizer, model)`` tuple; the model is in evaluation mode.
    """
    additional_special_tokens = [
        '<|titlestart|>',
        '<|titleend|>',
        '<|authornamebegin|>',
        '<|authornameend|>',
    ]
    tokenizer = GPT2Tokenizer.from_pretrained(
        'gpt2-medium',
        bos_token='<|startoftext|>',
        eos_token='<|endoftext|>',
        pad_token='<|pad|>',
        additional_special_tokens=additional_special_tokens,
    )
    config = GPT2Config.from_json_file('./config.json')
    model = GPT2LMHeadModel(config)
    state_dict = torch.load('./pytorch_model.bin', map_location=torch.device('cpu'))
    model.load_state_dict(state_dict)
    # A directly constructed module starts in training mode; switch to eval so
    # dropout does not perturb generation.
    model.eval()
    return tokenizer, model


def _fetch_video_title(url, timeout=10):
    """Return the title of the video at *url* via the YouTube oEmbed endpoint.

    Raises:
        OSError: On network failures, timeouts and HTTP error responses.
        ValueError: If the response body is not valid JSON text.
        KeyError: If the response carries no ``title`` field.
    """
    query_string = urllib.parse.urlencode({"format": "json", "url": url})
    request_url = OEMBED_URL + "?" + query_string
    with urllib.request.urlopen(request_url, timeout=timeout) as response:
        payload = json.loads(response.read().decode())
    return payload['title']


def main():
    """Build the Streamlit page: sidebar controls, URL form and results."""
    tokenizer, model = load_model()

    st.title("YouTube comments generating project")
    st.header('YouTube comments generator')

    st.sidebar.title("Features")

    # Insert user-controlled values from sliders into the feature vector.
    features = {
        "num": st.sidebar.slider("Количество комментариев", 0, 20, 1, 1),
        "t": st.sidebar.slider("Температура", 0, 300, 180, 1),
        "top_p": st.sidebar.slider("Top-p", 0, 100, 95, 5),
        "max_length": st.sidebar.slider("Максимальная длина комментария", 0, 300, 100, 5),
    }

    st.sidebar.title("Note")
    st.sidebar.write(
        """ Изменяя значения, можно получить различные выводы модели """
    )
    st.sidebar.write(
        """ Значение температуры делится на 100 """
    )
    st.sidebar.caption(f"Streamlit version `{st.__version__}`")

    with st.form(key='my_form'):
        url = st.text_input('Введите url видео на YouTube')
        st.form_submit_button('Готово!')

    if not url:
        return

    try:
        title = _fetch_video_title(url)
    except (OSError, ValueError, KeyError) as err:
        # Invalid or unreachable URLs should produce a message, not a traceback.
        st.error(f"Не удалось получить информацию о видео: {err}")
        return

    st.write('Video Title: ' + title)
    st.video(url)
    generate(tokenizer, model, title, features)


if __name__ == "__main__":
    main()