import re

import phonlp
import underthesea
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

ORIGINAL_DATA = "./data/news_v2/news_v2.json"
PROCESSED_DATA = "./data/processed_data/final_data.json"

# Load the pretrained PhoNLP model (POS tagging, NER, dependency parsing)
# from a local directory.
nlp_model = phonlp.load(save_dir="./phonlp")
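# If the model has not been downloaded yet, it can be fetched once beforehand
# (one-time setup, so it is left commented out here):
# phonlp.download(save_dir="./phonlp")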

# Build a local SparkSession. Executor-level settings such as
# spark.executor.instances have no effect under a local[*] master; they are
# kept so the script can be pointed at a real cluster unchanged.
spark = SparkSession.builder \
    .appName("Preprocessing") \
    .master("local[*]") \
    .config("spark.executor.memory", "8g") \
    .config("spark.executor.instances", "64") \
    .config("spark.executor.cores", "1") \
    .config("spark.memory.offHeap.enabled", "true") \
    .config("spark.driver.memory", "50g") \
    .config("spark.memory.offHeap.size", "16g") \
    .config("spark.ui.showConsoleProgress", "false") \
    .config("spark.driver.maxResultSize", "8g") \
    .config("spark.log.level", "ERROR") \
    .getOrCreate()
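
# Optional sanity check: print the effective Spark configuration to confirm
# the settings above were applied.
# for key, value in spark.sparkContext.getConf().getAll():
#     print(key, "=", value)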

print("Loading data....")
df = spark.read.json(ORIGINAL_DATA)
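
# Optional: confirm the expected "content" and "category" fields exist
# before running the expensive UDF pass.
# df.printSchema()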

def preprocess_text(text):
    """Clean one article: strip punctuation, tokenize, POS-filter, lowercase."""
    if text is None:
        return ""

    # Keep word characters, whitespace, and periods; drop other punctuation.
    text = re.sub(r'[^\w\s.]', '', text)

    # Split into sentences so PhoNLP annotates one sentence at a time.
    sentences = underthesea.sent_tokenize(text)

    preprocessed_words = []
    for sentence in sentences:
        try:
            # Word-segment the sentence (multi-syllable words joined by "_").
            word_tokens = underthesea.word_tokenize(sentence, format="text")

            # PhoNLP returns (words, POS tags, NER tags, dependencies).
            tags = nlp_model.annotate(word_tokens, batch_size=64)

            # Drop numerals (M), unclassified words (X), and punctuation (CH),
            # plus stray quote/comma tokens, then lowercase what remains.
            filtered_words = [word.lower()
                              for word, tag in zip(tags[0][0], tags[1][0])
                              if tag[0] not in ['M', 'X', 'CH']
                              and word not in ["'", ","]]

            preprocessed_words.extend(filtered_words)
        except Exception:
            # Skip sentences PhoNLP cannot annotate (e.g. empty strings).
            pass

    return ' '.join(preprocessed_words)
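
# Quick smoke test for preprocess_text (illustrative sentence, not from the
# dataset):
# print(preprocess_text("Hà Nội là thủ đô của Việt Nam."))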

# Wrap the preprocessing function as a Spark UDF. The PhoNLP model referenced
# inside it is serialized with the closure and shipped to the workers.
preprocess_udf = udf(preprocess_text, StringType())

df_processed = df.withColumn("processed_content", preprocess_udf(df["content"]))
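
# Optional: eyeball a few processed rows before writing the full dataset.
# df_processed.select("processed_content", "category").show(5, truncate=80)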

# Keep only the columns needed downstream.
selected_columns = ["processed_content", "category"]
df_selected = df_processed.select(selected_columns)

num_partitions = 1024

# The repartition shuffle keeps the upstream stages parallel; coalesce(1)
# only collapses the final write stage so the output lands in a single part
# file. Note that Spark writes a directory at this path, not a bare .json.
df_selected.repartition(num_partitions).coalesce(1).write.json(PROCESSED_DATA)
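
# Optional verification: read the output back and count the records.
# print(spark.read.json(PROCESSED_DATA).count())

# Shut down the Spark session once the write has completed.
spark.stop()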