File size: 2,778 Bytes
0181645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import boto3
import os
import urllib.parse
import logging
from botocore.exceptions import NoCredentialsError, ClientError

# Example bucket, kept for reference only (not used by this module):
# bucket_name = "document-ingestion-drive-dev"

# AWS credentials are read from the environment once, at import time.
# Each value is None when the corresponding variable is not set.
aws_access_key_id = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.environ.get('AWS_SECRET_ACCESS_KEY')



def get_s3_client(region_name='us-west-2'):
    """Create a boto3 S3 client from the module-level credentials.

    Args:
        region_name: AWS region the client is bound to. Defaults to
            'us-west-2', the value that used to be hard-coded here, so
            existing zero-argument callers behave exactly as before.

    Returns:
        The object returned by ``boto3.client('s3', ...)``.

    Raises:
        Exception: whatever ``boto3.client`` raises is logged and then
            re-raised unchanged.
    """
    try:
        s3_client = boto3.client(
            's3',
            aws_access_key_id=aws_access_key_id,
            aws_secret_access_key=aws_secret_access_key,
            region_name=region_name,
        )
    except Exception as e:
        # A single handler suffices: the previous NoCredentialsError branch
        # logged and re-raised in exactly the same way as this one.
        logging.error(f"Failed to initialize S3 client: {str(e)}")
        raise

    logging.info("S3 client initialized successfully.")
    return s3_client


def _infer_file_format(metadata, content_type):
    """Return a file-format label from custom metadata or the Content-Type.

    The 'file_format' metadata entry wins when present and non-empty.
    Otherwise the Content-Type is scanned for the substrings 'word', 'pdf'
    and 'text' (in that order); anything else maps to 'unknown'.

    Raises:
        ValueError: if neither source yields any information.
    """
    file_format = metadata.get('file_format')
    if file_format:
        return file_format

    if not content_type:
        raise ValueError("File format could not be determined from metadata or Content-Type.")

    # MIME types are case-insensitive, so compare on a lowercased copy.
    lowered = content_type.lower()
    for marker, label in (('word', 'docx'), ('pdf', 'pdf'), ('text', 'txt')):
        if marker in lowered:
            return label
    return 'unknown'


def read_s3_file(bucket_name, key):
    """Download an S3 object and report its content, metadata and format.

    Args:
        bucket_name: Name of the S3 bucket.
        key: Key of the object inside the bucket.

    Returns:
        A tuple ``(content, metadata, file_format)`` where ``content`` is the
        value returned by reading the response body, ``metadata`` is the
        response's 'Metadata' mapping (``{}`` when absent) and ``file_format``
        is the label chosen by ``_infer_file_format``.

    Raises:
        FileNotFoundError: the service reported error code 'NoSuchKey'.
        PermissionError: no AWS credentials could be found.
        ValueError: the format could not be determined.
        ClientError: any other service error, re-raised unchanged.
        Exception: any other failure (including client creation), logged and
            re-raised unchanged.
    """
    try:
        logging.info(f"Reading file from S3: bucket={bucket_name}, key={key}")
        s3 = get_s3_client()
        response = s3.get_object(Bucket=bucket_name, Key=key)

        # Always release the underlying stream, even if read() fails.
        body = response['Body']
        try:
            content = body.read()
        finally:
            body.close()

        metadata = response.get('Metadata', {})
        file_format = _infer_file_format(metadata, response.get('ContentType'))

        logging.info(f"File read successfully from S3: bucket={bucket_name}, key={key}, format={file_format}")
        return content, metadata, file_format
    except NoCredentialsError as e:
        logging.error("AWS credentials not found.")
        raise PermissionError("AWS credentials not found.") from e
    except ClientError as e:
        # Match on the error code instead of `s3.exceptions.NoSuchKey`: that
        # expression needs `s3` to be bound, and if get_s3_client() itself
        # failed it would raise UnboundLocalError and hide the real error.
        if e.response.get('Error', {}).get('Code') == 'NoSuchKey':
            logging.error(f"File not found in S3: bucket={bucket_name}, key={key}")
            raise FileNotFoundError(f"File not found: bucket={bucket_name}, key={key}") from e
        logging.error(f"Error reading file from S3: {str(e)}")
        raise
    except Exception as e:
        logging.error(f"Error reading file from S3: {str(e)}")
        raise


# Configure the root logger at import time: INFO level, with each record
# rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)