import glob
import json
import logging
import re

from deep_translator import GoogleTranslator
# NOTE: average_gematria is assumed to live in the local gematria module alongside
# calculate_gematria; it is only needed for the average_compile=True path below.
from gematria import calculate_gematria, average_gematria

logger = logging.getLogger(__name__)


def process_json_files(start=1, end=52, step=1, rounds="1,-1", length=0, tlang="en", strip_spaces=True,
                       strip_in_braces=True, strip_diacritics=True, average_compile=False, translate=False):
    """Select characters at fixed intervals from Tripitaka JSON files and compute their gematria.

    For every book number in [start, end], each matching JSON file under texts/tripitaka is read,
    its "gathas" are concatenated and cleaned, and characters are picked every `step` positions for
    each round listed in `rounds` (a comma-separated string; positive values walk forward, negative
    values walk backward). The selection is optionally truncated to `length` characters, translated
    to `tlang`, and returned together with its gematria sum.
    """
    base_path = "texts/tripitaka"
    translator = GoogleTranslator(source='ne', target=tlang)
    results = []

    for i in range(start, end + 1):
        # Book files are expected to be named like "01*.json", "02*.json", ...
        file_pattern = f"{base_path}/{i:02}*.json"
        for file_name in glob.glob(file_pattern):
            try:
                with open(file_name, 'r', encoding='utf-8') as file:
                    data = json.load(file)

                # Concatenate all gathas into one block of text.
                full_text = ""
                for gatha in data.get("gathas", []):
                    full_text += gatha + " "

                # Strip editorial insertions in square brackets, e.g. verse numbers or notes.
                clean_text = full_text
                if strip_in_braces:
                    clean_text = re.sub(r"\[.*?\]", "", clean_text, flags=re.DOTALL)

                if strip_diacritics:
                    # Keep only Devanagari characters and whitespace, then drop Vedic accent
                    # marks, danda/double danda, Devanagari digits and leftover punctuation.
                    clean_text = re.sub(r'[^\u0900-\u097F\s]', '', clean_text)
                    clean_text = re.sub(r'[\u0951-\u0954\u0964\u0965]+', '', clean_text)
                    clean_text = re.sub(r'[०१२३४५६७८९]+', '', clean_text)
                    clean_text = clean_text.replace(":", "")
                    clean_text = clean_text.replace("?", "")
                    clean_text = clean_text.replace("!", "")
                    clean_text = clean_text.replace("-", "")
                    clean_text = clean_text.replace("'", "")

                # Normalise line breaks and runs of whitespace to single spaces.
                clean_text = clean_text.replace("\n\n ", " ")
                clean_text = clean_text.replace("\n", " ")
                clean_text = re.sub(r'\s+', ' ', clean_text)

                if strip_spaces:
                    clean_text = clean_text.replace(" ", "")

                text_length = len(clean_text)
                if text_length == 0:
                    # Nothing left after cleaning; skip to avoid a modulo-by-zero below.
                    continue

                # For each requested round, walk the text picking every `step`-th character.
                # Positive round counts move forward from the start, negative counts move
                # backward from the end; abs(round_num) full passes are made over the text.
                selected_characters_per_round = {}
                for round_num in map(int, rounds.split(',')):
                    if round_num in (1, -1) and step > text_length:
                        # Skip single passes whose step exceeds the text length.
                        continue

                    if round_num > 0:
                        current_position = step - 1
                    else:
                        current_position = text_length - 1 if step == 1 else text_length - step

                    completed_rounds = 0
                    selected_characters = ""

                    while completed_rounds < abs(round_num):
                        selected_characters += clean_text[current_position % text_length]
                        current_position += step if round_num > 0 else -step

                        # A pass is finished once the cursor crosses the text boundary
                        # for the current round.
                        if (round_num > 0 and current_position >= text_length * (completed_rounds + 1)) or \
                                (round_num < 0 and current_position < text_length * completed_rounds - 1):
                            completed_rounds += 1

                    selected_characters_per_round[round_num] = selected_characters

                if average_compile and len(selected_characters_per_round) > 1:
                    # Average adjacent rounds by gematria; note that, as written, only the
                    # result for the last adjacent pair of rounds is kept.
                    result_text = ""
                    keys = sorted(selected_characters_per_round.keys())
                    for j in range(len(keys) - 1):
                        result_text = average_gematria(selected_characters_per_round[keys[j]],
                                                       selected_characters_per_round[keys[j + 1]])
                else:
                    result_text = ''.join(selected_characters_per_round.values())

                if length != 0:
                    result_text = result_text[:length]

                # Translation is an online call (Google Translate via deep_translator), so it is
                # only made when explicitly requested and there is text to translate.
                translated_text = translator.translate(result_text) if result_text and translate else ""

                if result_text:
                    results.append({
                        "book": f"Tripitaka {i}.",
                        "title": f'{data.get("title")} {data.get("book_name")} {data.get("chapter")}',
                        "result_text": result_text,
                        "result_sum": calculate_gematria(result_text),
                        "translated_text": translated_text,
                        "source_language": "mr",
                    })

            except (FileNotFoundError, json.JSONDecodeError, KeyError) as e:
                logger.error("Error processing %s: %s", file_name, e)
                results.append({"error": f"Error processing {file_name}: {e}"})

    return results
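

# A minimal usage sketch, not part of the module's public surface. It assumes a
# texts/tripitaka directory with files named like "01*.json" relative to the working
# directory; the book range, step and printed fields below are illustrative only.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    # Forward and backward single passes over books 1-3, keeping the first 30 selected
    # characters and skipping the online translation step.
    for entry in process_json_files(start=1, end=3, step=7, rounds="1,-1", length=30, translate=False):
        if "error" in entry:
            logger.warning(entry["error"])
        else:
            print(entry["book"], entry["result_sum"], entry["result_text"])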