Spaces:
Sleeping
Sleeping
File size: 4,303 Bytes
fb95c43 a19d954 08754e8 fb95c43 a19d954 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
import hashlib
import datetime
import os
import uuid
from typing import Dict
import re
# from rag_app.utils import logger
# logger = logger.get_console_logger("utils")
def extract_urls(data_list):
    """
    Extract up to the last three URLs from formatted search-result strings.

    Parameters:
    - data_list (list): Strings of the form
      "Title: ..., Link: ..., Summary: ..." (case-insensitive markers).

    Returns:
    - list: URLs from the final three items that contained one
      (fewer if fewer items carried a URL).
    """
    urls = []
    link_prefix = 'link: '
    summary_prefix = ', summary:'
    for item in data_list:
        # Search case-insensitively, but slice the original string so the
        # URL keeps its original casing.
        lower_case = item.lower()
        try:
            start_idx = lower_case.index(link_prefix) + len(link_prefix)
            end_idx = lower_case.index(summary_prefix, start_idx)
        except ValueError:
            # 'link: ' or ', summary:' is missing from this item.
            print("Could not find a URL in the item:", item)
            continue
        urls.append(item[start_idx:end_idx])
    # Only the most recent three sources are of interest to callers.
    return urls[-3:]
def format_search_results(search_results):
    """
    Format a list of search-result dictionaries into display strings.

    Each dictionary is expected to have the keys 'title', 'link', and
    'snippet'.

    Parameters:
    - search_results (list): A list of dictionaries, each containing
      'title', 'link', and 'snippet'.

    Returns:
    - list: One "Title: ..., Link: ..., Summary: ..." string per result;
      an empty list for empty input.
    """
    # Format every result unconditionally: the previous `len(...) > 1`
    # guard implicitly returned None for zero- and one-element inputs,
    # dropping valid single-result responses.
    return [
        "Title: {title}, Link: {link}, Summary: {snippet}".format(**result)
        for result in search_results
    ]
def parse_list_to_dicts(items: list) -> list:
    """
    Parse formatted search-result strings into structured dictionaries.

    Each input string is expected to look like
    "Title: <title>, Link: <url>, Summary: <summary>".

    Parameters:
    - items (list): Formatted search-result strings.

    Returns:
    - list: One dict per well-formed item with keys 'url', 'title',
      'hash_id' (MD5 hex digest of the URL) and 'summary'. Malformed
      items are skipped.
    """
    parsed_items = []
    for item in items:
        title_marker = item.find('Title: ')
        link_marker = item.find('Link: ')
        summary_marker = item.find('Summary: ')
        title_end = item.find(', Link: ')
        link_end = item.find(', Summary: ')
        # Skip malformed entries: previously a missing marker made find()
        # return -1 and silently produced nonsense substrings.
        if -1 in (title_marker, link_marker, summary_marker, title_end, link_end):
            continue
        title = item[title_marker + len('Title: '):title_end]
        link = item[link_marker + len('Link: '):link_end]
        summary = item[summary_marker + len('Summary: '):]
        parsed_items.append({
            "url": link,
            "title": title,
            # MD5 of the URL — same digest the module's hash_text helper
            # produces, computed inline.
            "hash_id": hashlib.md5(link.encode()).hexdigest(),
            "summary": summary,
        })
    return parsed_items
def hash_text(text: str) -> str:
    """Return the MD5 hex digest of *text* (UTF-8 encoded)."""
    digest = hashlib.md5(text.encode())
    return digest.hexdigest()
def convert_timestamp_to_datetime(timestamp: str) -> str:
    """Convert a Unix-epoch timestamp string to 'YYYY-MM-DD HH:MM:SS'.

    Note: uses the machine's local timezone (naive datetime), matching
    datetime.datetime.fromtimestamp.
    """
    moment = datetime.datetime.fromtimestamp(int(timestamp))
    return moment.strftime("%Y-%m-%d %H:%M:%S")
def create_folder_if_not_exists(folder_path: str) -> None:
    """
    Create a folder (including parents) if it doesn't already exist.

    Uses EAFP (try/except around os.makedirs) instead of an exists()
    pre-check, so a folder created concurrently between the check and
    the creation no longer raises (TOCTOU race).

    Args:
    - folder_path (str): The path of the folder to create.
    """
    try:
        os.makedirs(folder_path)
        print(f"Folder '{folder_path}' created.")
    except FileExistsError:
        # Path already exists (as a directory or a file) — same message
        # the old exists() branch printed.
        print(f"Folder '{folder_path}' already exists.")
def generate_uuid() -> str:
    """
    Generate a random version-4 UUID and return its canonical string form.

    Returns:
        str: A UUID string (36 characters, e.g. 'xxxxxxxx-xxxx-4xxx-...').
    """
    return f"{uuid.uuid4()}"
def extract_responses(text: str) -> Dict[str, str]:
    """
    Extract the user response and AI response from the provided text.

    The user turn is expected to look like "USER: <text> \\n" (note the
    space before the newline) and the AI turn like "AI: <text>" running
    to the end of the string; a missing turn yields an empty string.

    Args:
        text (str): The input text containing user and AI responses.

    Returns:
        Dict[str, str]: A dictionary with keys 'USER' and 'AI' containing
        the respective responses ("" when a turn is absent).
    """
    captured = {}
    # Same patterns as before, applied data-driven instead of via two
    # module-local compiled objects.
    for label, pattern in (("USER", r'USER: (.*?) \n'), ("AI", r'AI: (.*?)$')):
        match = re.search(pattern, text, re.DOTALL)
        captured[label] = match.group(1) if match else ""
    return captured