|
|
|
"""Copy of english model testing.ipynb |
|
|
|
Automatically generated by Colab. |
|
|
|
Original file is located at |
|
https://colab.research.google.com/drive/13LT1keMRDkMSrOYjvzkneI_PaRnLQWl0 |
|
""" |
|
|
|
|
|
|
|
from nltk.corpus import stopwords |
|
import pandas as pd |
|
|
|
import numpy as np |
|
import tensorflow as tf |
|
from tensorflow.keras.preprocessing.text import Tokenizer |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
from tensorflow.keras.models import Sequential, Model |
|
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Input, GRU |
|
|
|
from nltk.corpus import stopwords |
|
from nltk.stem import WordNetLemmatizer |
|
import nltk |
|
import requests |
|
# Fetch the NLTK corpora/models required by the preprocessing imports above.
for resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
|
|
|
# Download the raw English poem corpus (one poem line per text line).
eurl = 'https://raw.githubusercontent.com/sofiagiaccotto/newengpoemdatasetNLP/main/poems.txt'
ans = requests.get(eurl)
edf = ans.text

# Case-insensitive vocabulary: lowercase everything, then split on newlines
# so each poem line becomes one training sentence.
tokenizer = Tokenizer()
corpus = edf.lower().split("\n")
tokenizer.fit_on_texts(corpus)

# +1 reserves index 0, which Keras uses as the padding index.
total_words = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
print(total_words)
|
|
|
# Build n-gram prefixes: for each tokenized line, every prefix of length
# 2..len becomes one training example (all-but-last tokens -> last token).
input_sequences = []
for line in corpus:
    token_ids = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(token_ids[:end + 1] for end in range(1, len(token_ids)))

# Left-pad every prefix to the length of the longest one so they stack
# into a single rectangular array.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

# Features are all tokens but the last; the label is the final token,
# one-hot encoded over the full vocabulary.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
|
|
|
import requests

# Pretrained English poem-generation model hosted on GitHub.
url = 'https://github.com/Obai33/NLP_PoemGenerationDatasets/raw/main/modeleng1.h5'
local_filename = 'modeleng1.h5'

# Download the weights file. raise_for_status() makes a failed download
# (404/500) fail loudly here, instead of silently writing an HTML error
# page to disk and producing a confusing load_model failure later.
response = requests.get(url)
response.raise_for_status()
with open(local_filename, 'wb') as f:
    f.write(response.content)

model = tf.keras.models.load_model(local_filename)
|
|
|
def generate_english_text(seed_text, next_words=50):
    """Greedily extend *seed_text* by predicting one word at a time.

    Parameters
    ----------
    seed_text : str
        Starting prompt; tokenized with the module-level corpus tokenizer.
    next_words : int, optional
        Maximum number of words to append (default 50).

    Returns
    -------
    str
        The seed text followed by the generated words.
    """
    generated_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([generated_text])[0]
        # The model was trained on prefixes of length max_sequence_len - 1.
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        # Extract the scalar class id; verbose=0 silences the per-step
        # progress bar that model.predict prints by default.
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        # O(1) reverse lookup via index_word instead of scanning the whole
        # vocabulary. Index 0 is the padding index and maps to no word, so
        # stop generating instead of appending dangling spaces.
        output_word = tokenizer.index_word.get(predicted, "")
        if not output_word:
            break
        generated_text += " " + output_word
    return generated_text
|
|
|
import gradio as gr

# Minimal web UI: a single text box in, the generated poem text out.
iface = gr.Interface(
    generate_english_text,
    inputs="text",
    outputs="text",
    title="English Poetry Generation",
    description="Enter English text to generate a small poem.",
    theme="compact",
)

iface.launch()