import re

import phonlp
import underthesea
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

ORIGINAL_DATA = "./data/news_v2/news_v2.json"
PROCESSED_DATA = "./data/processed_data/final_data.json"

# Load the pretrained PhoNLP model (POS tagging, NER, dependency parsing)
# from a local directory.
nlp_model = phonlp.load(save_dir="./phonlp")
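# If the model has not been downloaded yet, it can be fetched once beforehand
# (one-time setup, so it is left commented out here):
# phonlp.download(save_dir="./phonlp")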

# Build a local SparkSession. Executor-level settings such as
# spark.executor.instances have no effect under a local[*] master; they are
# kept so the script can be pointed at a real cluster unchanged.
spark = SparkSession.builder \
    .appName("Preprocessing") \
    .master("local[*]") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.instances", "64") \
    .config("spark.executor.cores", "1") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.size", "16g") \
    .config("spark.ui.showConsoleProgress", "false") \
    .config("spark.driver.maxResultSize", "8g") \
    .config("spark.log.level", "ERROR") \
    .getOrCreate()
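
# Optional sanity check: print the effective Spark configuration to confirm
# the settings above were applied.
# for key, value in spark.sparkContext.getConf().getAll():
#     print(key, "=", value)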

print("Loading data....")
df = spark.read.json(ORIGINAL_DATA)
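
# Optional: confirm the expected "content" and "category" fields exist
# before running the expensive UDF pass.
# df.printSchema()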

def preprocess_text(text):
    """Clean one article: strip punctuation, tokenize, POS-filter, lowercase."""
    if text is None:
        return ""

    # Keep word characters, whitespace, and periods; drop other punctuation.
    text = re.sub(r'[^\w\s.]', '', text)

    # Split into sentences so PhoNLP annotates one sentence at a time.
    sentences = underthesea.sent_tokenize(text)

    preprocessed_words = []
    for sentence in sentences:
        try:
            # Word-segment the sentence (multi-syllable words joined by "_").
            word_tokens = underthesea.word_tokenize(sentence, format="text")

            # PhoNLP returns (words, POS tags, NER tags, dependencies).
            tags = nlp_model.annotate(word_tokens, batch_size=64)

            # Drop numerals (M), unclassified words (X), and punctuation (CH),
            # plus stray quote/comma tokens, then lowercase what remains.
            filtered_words = [word.lower()
                              for word, tag in zip(tags[0][0], tags[1][0])
                              if tag[0] not in ['M', 'X', 'CH']
                              and word not in ["'", ","]]

            preprocessed_words.extend(filtered_words)
        except Exception:
            # Skip sentences PhoNLP cannot annotate (e.g. empty strings).
            pass

    return ' '.join(preprocessed_words)
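
# Quick smoke test for preprocess_text (illustrative sentence, not from the
# dataset):
# print(preprocess_text("Hà Nội là thủ đô của Việt Nam."))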

# Wrap the preprocessing function as a Spark UDF. The PhoNLP model referenced
# inside it is serialized with the closure and shipped to the workers.
preprocess_udf = udf(preprocess_text, StringType())

df_processed = df.withColumn("processed_content", preprocess_udf(df["content"]))
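
# Optional: eyeball a few processed rows before writing the full dataset.
# df_processed.select("processed_content", "category").show(5, truncate=80)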

# Keep only the columns needed downstream.
selected_columns = ["processed_content", "category"]
df_selected = df_processed.select(selected_columns)

num_partitions = 1024

# The repartition shuffle keeps the upstream stages parallel; coalesce(1)
# only collapses the final write stage so the output lands in a single part
# file. Note that Spark writes a directory at this path, not a bare .json.
df_selected.repartition(num_partitions).coalesce(1).write.json(PROCESSED_DATA)
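
# Optional verification: read the output back and count the records.
# print(spark.read.json(PROCESSED_DATA).count())

# Shut down the Spark session once the write has completed.
spark.stop()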