Spaces:
Runtime error
Runtime error
Upload Utils.py
Browse files
App_Function_Libraries/Utils/Utils.py
CHANGED
|
@@ -20,13 +20,16 @@
|
|
| 20 |
####################
|
| 21 |
#
|
| 22 |
# Import necessary libraries
|
|
|
|
| 23 |
import configparser
|
| 24 |
import hashlib
|
| 25 |
import json
|
| 26 |
import logging
|
| 27 |
import os
|
| 28 |
import re
|
|
|
|
| 29 |
import time
|
|
|
|
| 30 |
from datetime import timedelta
|
| 31 |
from typing import Union, AnyStr
|
| 32 |
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
|
@@ -40,24 +43,27 @@ from tqdm import tqdm
|
|
| 40 |
#
|
| 41 |
# Function Definitions
|
| 42 |
|
| 43 |
-
def extract_text_from_segments(segments):
|
| 44 |
logging.debug(f"Segments received: {segments}")
|
| 45 |
logging.debug(f"Type of segments: {type(segments)}")
|
| 46 |
|
| 47 |
-
def extract_text_recursive(data):
|
| 48 |
if isinstance(data, dict):
|
|
|
|
|
|
|
|
|
|
| 49 |
for key, value in data.items():
|
| 50 |
if key == 'Text':
|
| 51 |
return value
|
| 52 |
elif isinstance(value, (dict, list)):
|
| 53 |
-
result = extract_text_recursive(value)
|
| 54 |
if result:
|
| 55 |
return result
|
| 56 |
elif isinstance(data, list):
|
| 57 |
-
return '
|
| 58 |
return None
|
| 59 |
|
| 60 |
-
text = extract_text_recursive(segments)
|
| 61 |
|
| 62 |
if text:
|
| 63 |
return text.strip()
|
|
@@ -367,7 +373,9 @@ def format_metadata_as_text(metadata):
|
|
| 367 |
else:
|
| 368 |
formatted_value = str(value)
|
| 369 |
|
| 370 |
-
|
|
|
|
|
|
|
| 371 |
return formatted_text.strip()
|
| 372 |
|
| 373 |
# # Example usage:
|
|
@@ -494,7 +502,7 @@ def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5
|
|
| 494 |
def create_download_directory(title):
|
| 495 |
base_dir = "Results"
|
| 496 |
# Remove characters that are illegal in Windows filenames and normalize
|
| 497 |
-
safe_title = normalize_title(title)
|
| 498 |
logging.debug(f"{title} successfully normalized")
|
| 499 |
session_path = os.path.join(base_dir, safe_title)
|
| 500 |
if not os.path.exists(session_path):
|
|
@@ -507,16 +515,28 @@ def create_download_directory(title):
|
|
| 507 |
|
| 508 |
def safe_read_file(file_path):
|
| 509 |
encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
for encoding in encodings:
|
| 511 |
try:
|
| 512 |
-
|
| 513 |
-
|
|
|
|
| 514 |
except UnicodeDecodeError:
|
| 515 |
continue
|
| 516 |
-
|
| 517 |
-
return f"File not found: {file_path}"
|
| 518 |
-
except Exception as e:
|
| 519 |
-
return f"An error occurred: {e}"
|
| 520 |
return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
|
| 521 |
|
| 522 |
#
|
|
@@ -591,13 +611,27 @@ def verify_checksum(file_path, expected_checksum):
|
|
| 591 |
return sha256_hash.hexdigest() == expected_checksum
|
| 592 |
|
| 593 |
|
| 594 |
-
def normalize_title(title):
|
| 595 |
# Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
|
| 596 |
title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 601 |
|
| 602 |
|
| 603 |
def clean_youtube_url(url):
|
|
@@ -640,6 +674,20 @@ def format_transcription(content):
|
|
| 640 |
|
| 641 |
return formatted_content
|
| 642 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
def format_file_path(file_path, fallback_path=None):
|
| 645 |
if file_path and os.path.exists(file_path):
|
|
@@ -696,7 +744,32 @@ def format_text_with_line_breaks(text):
|
|
| 696 |
#
|
| 697 |
# File Handling Functions
|
| 698 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
|
|
|
|
|
|
|
| 700 |
|
| 701 |
#
|
| 702 |
# End of File Handling Functions
|
|
|
|
| 20 |
####################
|
| 21 |
#
|
| 22 |
# Import necessary libraries
|
| 23 |
+
import chardet
|
| 24 |
import configparser
|
| 25 |
import hashlib
|
| 26 |
import json
|
| 27 |
import logging
|
| 28 |
import os
|
| 29 |
import re
|
| 30 |
+
import tempfile
|
| 31 |
import time
|
| 32 |
+
import uuid
|
| 33 |
from datetime import timedelta
|
| 34 |
from typing import Union, AnyStr
|
| 35 |
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
|
|
|
|
| 43 |
#
|
| 44 |
# Function Definitions
|
| 45 |
|
| 46 |
+
def extract_text_from_segments(segments, include_timestamps=True):
|
| 47 |
logging.debug(f"Segments received: {segments}")
|
| 48 |
logging.debug(f"Type of segments: {type(segments)}")
|
| 49 |
|
| 50 |
+
def extract_text_recursive(data, include_timestamps):
|
| 51 |
if isinstance(data, dict):
|
| 52 |
+
text = data.get('Text', '')
|
| 53 |
+
if include_timestamps and 'Time_Start' in data and 'Time_End' in data:
|
| 54 |
+
return f"{data['Time_Start']:.2f}s - {data['Time_End']:.2f}s | {text}"
|
| 55 |
for key, value in data.items():
|
| 56 |
if key == 'Text':
|
| 57 |
return value
|
| 58 |
elif isinstance(value, (dict, list)):
|
| 59 |
+
result = extract_text_recursive(value, include_timestamps)
|
| 60 |
if result:
|
| 61 |
return result
|
| 62 |
elif isinstance(data, list):
|
| 63 |
+
return '\n'.join(filter(None, [extract_text_recursive(item, include_timestamps) for item in data]))
|
| 64 |
return None
|
| 65 |
|
| 66 |
+
text = extract_text_recursive(segments, include_timestamps)
|
| 67 |
|
| 68 |
if text:
|
| 69 |
return text.strip()
|
|
|
|
| 373 |
else:
|
| 374 |
formatted_value = str(value)
|
| 375 |
|
| 376 |
+
# Replace underscores with spaces in the key name
|
| 377 |
+
formatted_key = key.replace('_', ' ').capitalize()
|
| 378 |
+
formatted_text += f"{formatted_key}: {formatted_value}\n"
|
| 379 |
return formatted_text.strip()
|
| 380 |
|
| 381 |
# # Example usage:
|
|
|
|
| 502 |
def create_download_directory(title):
|
| 503 |
base_dir = "Results"
|
| 504 |
# Remove characters that are illegal in Windows filenames and normalize
|
| 505 |
+
safe_title = normalize_title(title, preserve_spaces=False)
|
| 506 |
logging.debug(f"{title} successfully normalized")
|
| 507 |
session_path = os.path.join(base_dir, safe_title)
|
| 508 |
if not os.path.exists(session_path):
|
|
|
|
| 515 |
|
| 516 |
def safe_read_file(file_path):
    """Read a file as text, trying a detected encoding first and then common fallbacks.

    The file is read once as bytes. ``chardet`` is asked for its best guess,
    which is tried before the fixed fallback list. A candidate decoding is
    accepted only when every character is printable or whitespace.

    Args:
        file_path: Path of the file to read.

    Returns:
        str: The decoded content on success. On failure a human-readable
        message is returned instead of raising: "File not found: ...",
        "An error occurred while reading the file: ...", or
        "Unable to decode the file ...".
    """
    encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']

    try:
        with open(file_path, 'rb') as file:
            raw_data = file.read()
    except FileNotFoundError:
        return f"File not found: {file_path}"
    except Exception as e:
        return f"An error occurred while reading the file: {e}"

    # Use chardet to detect the encoding
    detected = chardet.detect(raw_data)
    if detected['encoding'] is not None:
        encodings.insert(0, detected['encoding'])

    for encoding in encodings:
        try:
            decoded_content = raw_data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            # LookupError: the detected encoding name may have no Python codec.
            continue
        # str.isprintable() is False for "\n", "\r" and "\t", so testing the
        # whole string with it rejects every normal multi-line text file.
        # Whitespace is therefore accepted explicitly.
        if all(ch.isprintable() or ch.isspace() for ch in decoded_content):
            return decoded_content

    return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
|
| 541 |
|
| 542 |
#
|
|
|
|
| 611 |
return sha256_hash.hexdigest() == expected_checksum
|
| 612 |
|
| 613 |
|
| 614 |
+
def normalize_title(title, preserve_spaces=False):
    """Convert a title into an ASCII string usable as a file or directory name.

    Args:
        title (str): The title to normalize.
        preserve_spaces (bool): When True, literal space characters are kept;
            when False (default), spaces are replaced with underscores too.

    Returns:
        str: The title with non-ASCII characters dropped (after NFKD
        decomposition, so accented letters keep their base letter), every
        character other than letters, digits, '_', '-', '.' (and ' ' when
        preserve_spaces is True) replaced by '_', runs of '_' collapsed to
        one, and leading/trailing '_' removed. May be an empty string.
    """
    # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')

    if preserve_spaces:
        # Keep only the literal space. Using \s here would also keep tabs,
        # newlines and carriage returns, which are not valid in file names.
        title = re.sub(r'[^\w \-.]', '_', title)
    else:
        # Replace special characters and spaces with underscores
        title = re.sub(r'[^\w\-.]', '_', title)

    # Replace multiple consecutive underscores with a single underscore
    title = re.sub(r'_+', '_', title)

    # No separate pass for / \ : " * ? < > | is needed: the substitution
    # above has already turned each of them into an underscore.
    return title.strip('_')
|
| 634 |
+
|
| 635 |
|
| 636 |
|
| 637 |
def clean_youtube_url(url):
|
|
|
|
| 674 |
|
| 675 |
return formatted_content
|
| 676 |
|
| 677 |
+
def sanitize_user_input(message):
    """
    Breaks up '{{' and '}}' sequences to prevent placeholder injection.

    A space is inserted between every pair of adjacent identical braces, so
    the result never contains '{{' or '}}', regardless of how many braces
    appear in a row.

    Args:
        message (str): The user's message.

    Returns:
        str: Sanitized message.
    """
    # A lookahead is used so that matches do not consume the following brace.
    # Substituting the two-character pattern '{{' leaves '{{{' as '{ {{',
    # which still contains a placeholder opener.
    message = re.sub(r'\{(?=\{)', '{ ', message)
    message = re.sub(r'\}(?=\})', '} ', message)
    return message
|
| 691 |
|
| 692 |
def format_file_path(file_path, fallback_path=None):
|
| 693 |
if file_path and os.path.exists(file_path):
|
|
|
|
| 744 |
#
|
| 745 |
# File Handling Functions
|
| 746 |
|
| 747 |
+
# Track temp files for cleanup
temp_files = []
temp_file_paths = []


def save_temp_file(file):
    """Copy an uploaded file object into the system temp directory.

    Args:
        file: Object exposing a ``name`` attribute (file name or path) and a
            ``read()`` method that returns bytes.

    Returns:
        str: Path of the copy inside ``tempfile.gettempdir()``. The path is
        also appended to the module-level ``temp_files`` list so that
        ``cleanup_temp_files()`` can delete it later.
    """
    # Use only the final path component. os.path.join() discards the temp
    # directory when the second argument is absolute, and honours "../", so
    # an unsanitized name could make this function write anywhere.
    file_name = os.path.basename(file.name)
    temp_path = os.path.join(tempfile.gettempdir(), file_name)

    # Read before opening the destination: if source and destination are the
    # same file, opening with 'wb' first would truncate the data.
    data = file.read()
    with open(temp_path, 'wb') as f:
        f.write(data)

    temp_files.append(temp_path)
    return temp_path


def cleanup_temp_files():
    """Delete every file recorded in ``temp_files`` and empty the list.

    Best-effort: paths that no longer exist are skipped, and a failure to
    delete one file is logged without stopping the remaining deletions.
    """
    for file_path in temp_files:
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                logging.info(f"Removed temporary file: {file_path}")
            except Exception as e:
                # Cleanup must never raise; record the failure and move on.
                logging.error(f"Failed to remove temporary file {file_path}: {e}")
    temp_files.clear()
|
| 770 |
|
| 771 |
+
def generate_unique_id():
    """Return a new identifier: the prefix 'uploaded_file_' followed by a random UUID4."""
    random_part = str(uuid.uuid4())
    return "uploaded_file_" + random_part
|
| 773 |
|
| 774 |
#
|
| 775 |
# End of File Handling Functions
|