Spaces:

oceansweep
/

tldw

Runtime error

App Files Files Community

oceansweep committed on Oct 8, 2024

Commit

ee6aa85

verified ·

1 Parent(s): 8619cce

Upload Utils.py

Browse files

Files changed (1) hide show

App_Function_Libraries/Utils/Utils.py +91 -18

App_Function_Libraries/Utils/Utils.py CHANGED Viewed

@@ -20,13 +20,16 @@
 ####################
 #
 # Import necessary libraries
 import configparser
 import hashlib
 import json
 import logging
 import os
 import re
 import time
 from datetime import timedelta
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
@@ -40,24 +43,27 @@ from tqdm import tqdm
 #
 # Function Definitions
-def extract_text_from_segments(segments):
     logging.debug(f"Segments received: {segments}")
     logging.debug(f"Type of segments: {type(segments)}")
-    def extract_text_recursive(data):
         if isinstance(data, dict):
             for key, value in data.items():
                 if key == 'Text':
                     return value
                 elif isinstance(value, (dict, list)):
-                    result = extract_text_recursive(value)
                     if result:
                         return result
         elif isinstance(data, list):
-            return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
         return None
-    text = extract_text_recursive(segments)
     if text:
         return text.strip()
@@ -367,7 +373,9 @@ def format_metadata_as_text(metadata):
             else:
                 formatted_value = str(value)
-            formatted_text += f"{key.capitalize()}: {formatted_value}\n"
     return formatted_text.strip()
 # # Example usage:
@@ -494,7 +502,7 @@ def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5
 def create_download_directory(title):
     base_dir = "Results"
     # Remove characters that are illegal in Windows filenames and normalize
-    safe_title = normalize_title(title)
     logging.debug(f"{title} successfully normalized")
     session_path = os.path.join(base_dir, safe_title)
     if not os.path.exists(session_path):
@@ -507,16 +515,28 @@ def create_download_directory(title):
 def safe_read_file(file_path):
     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
     for encoding in encodings:
         try:
-            with open(file_path, 'r', encoding=encoding) as file:
-                return file.read()
         except UnicodeDecodeError:
             continue
-        except FileNotFoundError:
-            return f"File not found: {file_path}"
-        except Exception as e:
-            return f"An error occurred: {e}"
     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
 #
@@ -591,13 +611,27 @@ def verify_checksum(file_path, expected_checksum):
     return sha256_hash.hexdigest() == expected_checksum
-def normalize_title(title):
     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
-    title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
-                                                                                                                   '').replace(
-        '<', '').replace('>', '').replace('|', '')
-    return title
 def clean_youtube_url(url):
@@ -640,6 +674,20 @@ def format_transcription(content):
     return formatted_content
 def format_file_path(file_path, fallback_path=None):
     if file_path and os.path.exists(file_path):
@@ -696,7 +744,32 @@ def format_text_with_line_breaks(text):
 #
 # File Handling Functions
 #
 # End of File Handling Functions

 ####################
 #
 # Import necessary libraries
+import chardet
 import configparser
 import hashlib
 import json
 import logging
 import os
 import re
+import tempfile
 import time
+import uuid
 from datetime import timedelta
 from typing import Union, AnyStr
 from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
 #
 # Function Definitions
+def extract_text_from_segments(segments, include_timestamps=True):
     logging.debug(f"Segments received: {segments}")
     logging.debug(f"Type of segments: {type(segments)}")
+    def extract_text_recursive(data, include_timestamps):
         if isinstance(data, dict):
+            text = data.get('Text', '')
+            if include_timestamps and 'Time_Start' in data and 'Time_End' in data:
+                return f"{data['Time_Start']:.2f}s - {data['Time_End']:.2f}s | {text}"
             for key, value in data.items():
                 if key == 'Text':
                     return value
                 elif isinstance(value, (dict, list)):
+                    result = extract_text_recursive(value, include_timestamps)
                     if result:
                         return result
         elif isinstance(data, list):
+            return '\n'.join(filter(None, [extract_text_recursive(item, include_timestamps) for item in data]))
         return None
+    text = extract_text_recursive(segments, include_timestamps)
     if text:
         return text.strip()
             else:
                 formatted_value = str(value)
+            # Replace underscores with spaces in the key name
+            formatted_key = key.replace('_', ' ').capitalize()
+            formatted_text += f"{formatted_key}: {formatted_value}\n"
     return formatted_text.strip()
 # # Example usage:
 def create_download_directory(title):
     base_dir = "Results"
     # Remove characters that are illegal in Windows filenames and normalize
+    safe_title = normalize_title(title, preserve_spaces=False)
     logging.debug(f"{title} successfully normalized")
     session_path = os.path.join(base_dir, safe_title)
     if not os.path.exists(session_path):
 def safe_read_file(file_path):
     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
+    try:
+        with open(file_path, 'rb') as file:
+            raw_data = file.read()
+    except FileNotFoundError:
+        return f"File not found: {file_path}"
+    except Exception as e:
+        return f"An error occurred while reading the file: {e}"
+    # Use chardet to detect the encoding
+    detected = chardet.detect(raw_data)
+    if detected['encoding'] is not None:
+        encodings.insert(0, detected['encoding'])
     for encoding in encodings:
         try:
+            decoded_content = raw_data.decode(encoding)
+            if decoded_content.isprintable():
+                return decoded_content
         except UnicodeDecodeError:
             continue
     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
 #
     return sha256_hash.hexdigest() == expected_checksum
def normalize_title(title, preserve_spaces=False):
    """
    Reduce a title to an ASCII string that is safe to use as a file name.

    Accented characters are decomposed (NFKD) and their non-ASCII parts
    dropped. Every character that is not a word character, hyphen or dot is
    replaced by an underscore, runs of underscores are collapsed, and leading
    or trailing underscores are removed.

    Args:
        title (str): The title to normalize.
        preserve_spaces (bool): When True, whitespace is kept as single
            spaces instead of being turned into underscores.

    Returns:
        str: The normalized title; may be empty if nothing usable remains.
    """
    # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')

    if preserve_spaces:
        # Replace special characters with underscores, but keep whitespace
        title = re.sub(r'[^\w\s\-.]', '_', title)
        # Tabs and newlines are not valid in Windows file names, so every
        # whitespace run becomes one plain space.
        title = re.sub(r'\s+', ' ', title)
    else:
        # Replace special characters and spaces with underscores
        title = re.sub(r'[^\w\-.]', '_', title)

    # Replace multiple consecutive underscores with a single underscore
    title = re.sub(r'_+', '_', title)

    # The substitutions above already cover / \ : " * ? < > |, so no further
    # per-character replacement is needed. Trim separators from both ends.
    return title.strip('_ ')
 def clean_youtube_url(url):
     return formatted_content
def sanitize_user_input(message):
    """
    Break up '{{' and '}}' sequences to prevent placeholder injection.

    A space is inserted between adjacent identical braces, so the returned
    text never contains '{{' or '}}', whatever the length of the brace run.

    Args:
        message (str): The user's message.

    Returns:
        str: Sanitized message.
    """
    # Use a lookahead so each brace followed by the same brace gets a space
    # after it. Replacing whole pairs would leave odd-length runs partly
    # intact, e.g. '{{{' -> '{ {{'.
    message = re.sub(r'\{(?=\{)', '{ ', message)
    message = re.sub(r'\}(?=\})', '} ', message)
    return message
 def format_file_path(file_path, fallback_path=None):
     if file_path and os.path.exists(file_path):
 #
 # File Handling Functions
# Track temp files for cleanup
temp_files = []
temp_file_paths = []


def save_temp_file(file):
    """
    Copy an uploaded file object into the system temp directory.

    Args:
        file: A readable binary file-like object with a ``name`` attribute.

    Returns:
        str: Path of the copy inside the temp directory. The path is also
        appended to the module-level ``temp_files`` list.
    """
    # Read the payload before opening the destination: if both resolve to
    # the same path, opening with 'wb' first would truncate the source.
    data = file.read()

    # Keep only the final path component. os.path.join() discards temp_dir
    # when given an absolute name and follows '..' components, either of
    # which would let the write land outside the temp directory.
    temp_dir = tempfile.gettempdir()
    temp_path = os.path.join(temp_dir, os.path.basename(file.name))

    with open(temp_path, 'wb') as f:
        f.write(data)

    temp_files.append(temp_path)
    return temp_path
def cleanup_temp_files():
    """
    Delete every file recorded in ``temp_files`` and empty the list.

    Paths that no longer exist are skipped. A failed deletion is logged and
    does not stop the remaining files from being processed; the list is
    cleared afterwards either way.
    """
    for tracked_path in temp_files:
        if not os.path.exists(tracked_path):
            continue
        try:
            os.remove(tracked_path)
        except Exception as e:
            logging.error(f"Failed to remove temporary file {tracked_path}: {e}")
        else:
            logging.info(f"Removed temporary file: {tracked_path}")
    temp_files.clear()
def generate_unique_id():
    """Return a new identifier: 'uploaded_file_' followed by a random UUID4."""
    unique_suffix = uuid.uuid4()
    return "uploaded_file_" + str(unique_suffix)
 #
 # End of File Handling Functions