Yurii Paniv committed · Commit 8451e68
Parent(s): b3b12c1
Add text converting scripts for scorer

Files changed:
- .gitignore +3 -1
- scripts/extract_text_corpus.py +51 -0
- scripts/wiki_import.py +51 -0
.gitignore CHANGED
@@ -129,4 +129,6 @@ dmypy.json
 .pyre/
 
 *.tflite
-.DS_Store
+.DS_Store
+
+/data
scripts/extract_text_corpus.py ADDED
@@ -0,0 +1,51 @@
+import os
+import nltk
+import re
+nltk.download("punkt")
+
+FOLDER = "../data/текст/"
+OUT_FILE = "../data/texts.txt"
+text_file = open(OUT_FILE, mode="a")
+
+tokenizer = nltk.SpaceTokenizer()
+paranthesis_regex = re.compile(r'\(.*\)')
+allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
+                 "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
+
+for subdir, dirs, files in os.walk(FOLDER):
+    for file in files:
+        file_path = os.path.join(subdir, file)
+        print(file_path)
+        input_file = open(file_path)
+        try:
+            cleaned_text = input_file.read()
+        except:
+            input_file.close()
+            input_file = open(file_path, encoding="cp1251")
+            cleaned_text = input_file.read()
+        cleaned_text = cleaned_text.lower()
+        cleaned_text = paranthesis_regex.sub('', cleaned_text)
+        cleaned_text = cleaned_text.strip()
+        cleaned_text = cleaned_text.split(".")
+        out_text = []
+        for text in cleaned_text:
+            text = text.strip()
+
+            words = tokenizer.tokenize(text)
+            words = [i for i in words if i.isalnum()]
+            words = [i for i in words if not i.isdigit()]
+            words = [i for i in words if len(i) > 1]
+            if any([any(j not in allowed_chars for j in i) for i in words]):
+                continue
+            if len(words) == 0:
+                continue
+            out_text.append(
+                " ".join(words))
+        cleaned_text = "\n".join(out_text)
+        if cleaned_text == "":
+            continue
+        text_file.write(cleaned_text + "\n")
+        input_file.close()
+
+
+text_file.close()
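For reference, a minimal standalone sketch of the per-sentence filter used above (the sample sentence is made up for illustration; allowed_chars is collapsed into a set for faster membership tests):

import nltk

tokenizer = nltk.SpaceTokenizer()
# Same alphabet as the script, as a set for O(1) lookups.
allowed_chars = set("абвгґдеєжзиіїйклмнопрстуфхцчшщьюя-'")

def clean_sentence(sentence):
    # Mirror the script: split on spaces, drop punctuation-bearing
    # tokens, bare digits, and single-character tokens.
    words = tokenizer.tokenize(sentence.lower())
    words = [w for w in words if w.isalnum() and not w.isdigit() and len(w) > 1]
    # Reject the whole sentence if any surviving word contains a
    # character outside the allowed alphabet.
    if any(any(ch not in allowed_chars for ch in w) for w in words):
        return ""
    return " ".join(words)

print(clean_sentence("Це приклад речення"))  # -> "це приклад речення"

One quirk worth noting: str.isalnum() is False for tokens containing a hyphen or apostrophe, so such words are discarded before the final check ever sees the "-" and "'" entries in allowed_chars.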
scripts/wiki_import.py ADDED
@@ -0,0 +1,51 @@
+from wiki_dump_reader import Cleaner, iterate
+from os import remove
+import nltk
+import re
+nltk.download("punkt")
+
+
+remove("../data/wiki_text.txt")
+text_file = open("../data/wiki_text.txt", mode="a")
+
+tokenizer = nltk.SpaceTokenizer()
+paranthesis_regex = re.compile(r'\(.*\)')
+allowed_chars = ["а", "б", "в", "г", "ґ", "д", "е", "є", "ж", "з", "и", "і", "ї", "й", "к", "л",
+                 "м", "н", "о", "п", "р", "с", "т", "у", "ф", "х", "ц", "ч", "ш", "щ", "ь", "ю", "я", "-", "'"]
+
+cleaner = Cleaner()
+for title, text in iterate('../data/ukwiki-20210320-pages-articles-multistream.xml'):
+    text = cleaner.clean_text(text)
+    cleaned_text, _ = cleaner.build_links(text)
+    cleaned_text = cleaned_text.lower()
+    cleaned_text = cleaned_text.replace(" ", " ")
+    cleaned_text = cleaned_text.replace("н. е.", "нашої ери")
+    cleaned_text = cleaned_text.replace("ім.", "імені")
+    cleaned_text = cleaned_text.replace("див.", "дивись")
+    cleaned_text = paranthesis_regex.sub('', cleaned_text)
+    cleaned_text = cleaned_text.strip()
+    cleaned_text = cleaned_text.split(".")
+    out_text = []
+    for text in cleaned_text:
+        text = text.strip()
+        if text.endswith(", що вивчає"):
+            continue
+        if text.startswith("redirect") or text.startswith("перенаправлення"):
+            continue
+
+        words = tokenizer.tokenize(text)
+        words = [i for i in words if i.isalnum()]
+        words = [i for i in words if not i.isdigit()]
+        words = [i for i in words if len(i) > 1]
+        if any([any(j not in allowed_chars for j in i) for i in words]):
+            continue
+        if len(words) == 0:
+            continue
+        out_text.append(
+            " ".join(words))
+    cleaned_text = "\n".join(out_text)
+    if cleaned_text == "":
+        continue
+    text_file.write(cleaned_text + "\n")
+
+text_file.close()
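Two remarks on the script above. First, the replace(" ", " ") line presumably substitutes a regular space for a non-breaking space (U+00A0), which renders identically here. Second, the unconditional remove() raises FileNotFoundError when ../data/wiki_text.txt does not exist yet, e.g. on a first run; a guarded sketch:

import os

OUT_PATH = "../data/wiki_text.txt"

# Delete the stale output only if it is actually present, so a
# first run does not crash before any articles are processed.
if os.path.exists(OUT_PATH):
    os.remove(OUT_PATH)

Opening the output with mode="w" instead of "a" would truncate it on open and make the explicit delete unnecessary.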