HamidBekam committed on
Commit
5ac59b2
·
1 Parent(s): 5146ca2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -90
app.py CHANGED
@@ -1,91 +1,2 @@
1
- import streamlit as st
2
- import pandas as pd
3
- import re
4
- import nltk
5
- from PIL import Image
6
- import os
7
- import numpy as np
8
- import seaborn as sns
9
- from wordcloud import WordCloud, STOPWORDS
10
- from nltk.corpus import stopwords
11
- import datasets
12
- from datasets import load_dataset
13
- import matplotlib.pyplot as plt
14
- import sklearn
15
- from sklearn.preprocessing import LabelEncoder
16
- sns.set_palette("RdBu")
17
- # loading dataset
18
- dataset = load_dataset("merve/poetry", streaming=True)
19
- df = pd.DataFrame.from_dict(dataset["train"])
20
 
21
-
22
- d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
23
- nltk.download("stopwords")
24
- stop = stopwords.words('english')
25
-
26
- # standardizing dataset by removing special characters and lowercasing
27
-
28
- def standardize(text, remove_digits=True):
29
- text=re.sub('[^a-zA-Z\d\s]', '',text)
30
- text = text.lower()
31
-
32
- return text
33
- st.set_option('deprecation.showPyplotGlobalUse', False)
34
- st.write("Poetry dataset, content column cleaned from special characters and lowercased")
35
- df.content = df.content.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
36
- df.content=df.content.apply(standardize)
37
- st.dataframe(df)
38
-
39
- st.subheader("Visualization on dataset statistics")
40
-
41
- st.write("Number of poems written in each type")
42
- sns.catplot(x="type", data=df, kind="count")
43
- plt.xticks(rotation=0)
44
- st.pyplot()
45
-
46
- st.write("Number of poems for each age")
47
- sns.catplot(x="age", data=df, kind="count")
48
- plt.xticks(rotation=0)
49
- st.pyplot()
50
-
51
- st.write("Number of poems for each author")
52
- sns.catplot(x="author", data=df, kind="count", aspect = 4)
53
- plt.xticks(rotation=90)
54
- st.pyplot()
55
-
56
- # distributions of poem types according to ages and authors
57
- st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")
58
- le = LabelEncoder()
59
-
60
- df.author = le.fit_transform(df.author)
61
- sns.catplot(x="age", y="author",hue="type", data=df)
62
- st.pyplot()
63
-
64
-
65
- #words = df.content.str.split(expand=True).unstack().value_counts()
66
- # most appearing words other than stop words
67
- words = df.content.str.split(expand=True).unstack().value_counts()
68
- renaissance = df.content.loc[df.age == "Renaissance"].str.split(expand=True).unstack().value_counts()
69
- modern = df.content.loc[df.age == "Modern"].str.split(expand=True).unstack().value_counts()
70
- st.subheader("Visualizing content")
71
- mask = np.array(Image.open(os.path.join(d, "poet.png")))
72
-
73
- import matplotlib.pyplot as plt
74
- def word_cloud(content, title):
75
- wc = WordCloud(background_color="white", max_words=200,contour_width=3,
76
- stopwords=STOPWORDS, max_font_size=50)
77
- wc.generate(" ".join(content.index.values))
78
- fig = plt.figure(figsize=(10, 10))
79
- plt.title(title, fontsize=20)
80
- plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray, interpolation = "bilinear", alpha=0.98)
81
- plt.axis('off')
82
- st.pyplot()
83
-
84
- st.subheader("Most appearing words excluding stopwords in poems according to ages")
85
- word_cloud(modern, "Word Cloud of Modern Poems")
86
-
87
- word_cloud(renaissance, "Word Cloud Renaissance Poems")
88
-
89
- # most appearing words including stopwords
90
- st.write("Most appearing words including stopwords")
91
- st.bar_chart(words[0:50])
 
1
+ gr.Interface.load("huggingface/bigscience/bloom-560m",title="Text Generator Five w/ Variables", description="Input your text, submit and the machine willoutput text.").launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2