Karthikeyen92 committed
Commit 88e348e · verified · Parent: f761616

Update py/handle_files.py

Files changed (1):
  py/handle_files.py (+102, -102)
py/handle_files.py CHANGED
@@ -1,103 +1,103 @@
-
-
- from datetime import datetime
- import json
- import os
- import pickle
- from typing import List
- from langchain.schema import Document
- import pandas as pd
-
- def create_files(social_media_data):
-     folder_path = 'Stock Sentiment Analysis/files'
-
-     if not os.path.exists(folder_path):
-         os.makedirs(folder_path)
-
-     # Save dictionary to a file
-     with open(folder_path+'/social_media_data.json', 'w') as f:
-         json.dump(social_media_data, f)
-
-     # Convert the data to a pandas DataFrame
-     df = pd.DataFrame(social_media_data)
-     df.head()
-
-     # Exporting the data to a CSV file
-     file_path = folder_path+"/social_media_data.csv"
-     df.to_csv(file_path, index=False)
-
-     df.to_pickle(folder_path+"/social_media_data.pkl")
-
- def fetch_social_media_data():
-     with open('Stock Sentiment Analysis/files/social_media_data.json', 'r') as file:
-         data = json.load(file)
-     social_media_document = []
-     for item in data:
-         social_media_document.append(Document(
-             page_content=str(item["page_content"]),
-             metadata={"platform": item["platform"],
-                       "company": item["company"],
-                       "ingestion_timestamp": datetime.now().isoformat(),
-                       "word_count": len(item["page_content"]["content"])
-                       }))
-     return social_media_document
-
- def save_ingested_data(ingested_data):
-     # Save the list to a file
-     with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'wb') as file:
-         pickle.dump(ingested_data, file)
-
- def save_analysed_data(analysed_data):
-     # Save the list to a file
-     with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'wb') as file:
-         pickle.dump(analysed_data, file)
-
- def get_ingested_data():
-     # Load the list from the file
-     with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'rb') as file:
-         loaded_documents = pickle.load(file)
-     return loaded_documents
-
- def get_analysed_data():
-     # Load the list from the file
-     with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'rb') as file:
-         loaded_documents = pickle.load(file)
-     return loaded_documents
-
- def sample_documents(documents: List[Document], n: int) -> List[Document]:
-     """
-     Samples `n` entries for each unique `"platform"` and `"company"` metadata combination from the input `Document[]`.
-
-     Args:
-         documents (List[Document]): The input list of `Document` objects.
-         n (int): The number of entries to sample for each unique metadata combination.
-
-     Returns:
-         List[Document]: A new list of `Document` objects, with `n` entries per unique metadata combination.
-     """
-     # Create a dictionary to store the sampled documents per metadata combination
-     sampled_docs = {}
-
-     for doc in documents:
-         combo = (doc.metadata["platform"], doc.metadata["company"])
-         if combo not in sampled_docs:
-             sampled_docs[combo] = []
-
-         # Add the document to the list for its metadata combination, up to n entries
-         if len(sampled_docs[combo]) < n:
-             sampled_docs[combo].append(doc)
-
-     # Flatten the dictionary into a single list
-     return [doc for docs in sampled_docs.values() for doc in docs]
-
- def to_documents(data) -> List[Document]:
-     social_media_document = []
-     for item in data:
-         social_media_document.append(Document(
-             page_content=str(item["page_content"]),
-             metadata={"platform": item["platform"],
-                       "company": item["company"],
-                       "ingestion_timestamp": datetime.now().isoformat(),
-                       "word_count": len(item["page_content"]["content"])
-                       }))
+
+
+ from datetime import datetime
+ import json
+ import os
+ import pickle
+ from typing import List
+ from langchain.schema import Document
+ import pandas as pd
+
+ def create_files(social_media_data):
+     folder_path = 'files'
+
+     if not os.path.exists(folder_path):
+         os.makedirs(folder_path)
+
+     # Save dictionary to a file
+     with open(folder_path+'/social_media_data.json', 'w') as f:
+         json.dump(social_media_data, f)
+
+     # Convert the data to a pandas DataFrame
+     df = pd.DataFrame(social_media_data)
+     df.head()
+
+     # Exporting the data to a CSV file
+     file_path = folder_path+"/social_media_data.csv"
+     df.to_csv(file_path, index=False)
+
+     df.to_pickle(folder_path+"/social_media_data.pkl")
+
+ def fetch_social_media_data():
+     with open('files/social_media_data.json', 'r') as file:
+         data = json.load(file)
+     social_media_document = []
+     for item in data:
+         social_media_document.append(Document(
+             page_content=str(item["page_content"]),
+             metadata={"platform": item["platform"],
+                       "company": item["company"],
+                       "ingestion_timestamp": datetime.now().isoformat(),
+                       "word_count": len(item["page_content"]["content"])
+                       }))
+     return social_media_document
+
+ def save_ingested_data(ingested_data):
+     # Save the list to a file
+     with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'wb') as file:
+         pickle.dump(ingested_data, file)
+
+ def save_analysed_data(analysed_data):
+     # Save the list to a file
+     with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'wb') as file:
+         pickle.dump(analysed_data, file)
+
+ def get_ingested_data():
+     # Load the list from the file
+     with open('Stock Sentiment Analysis/files/ingested_data.pkl', 'rb') as file:
+         loaded_documents = pickle.load(file)
+     return loaded_documents
+
+ def get_analysed_data():
+     # Load the list from the file
+     with open('Stock Sentiment Analysis/files/analysed_data.pkl', 'rb') as file:
+         loaded_documents = pickle.load(file)
+     return loaded_documents
+
+ def sample_documents(documents: List[Document], n: int) -> List[Document]:
+     """
+     Samples `n` entries for each unique `"platform"` and `"company"` metadata combination from the input `Document[]`.
+
+     Args:
+         documents (List[Document]): The input list of `Document` objects.
+         n (int): The number of entries to sample for each unique metadata combination.
+
+     Returns:
+         List[Document]: A new list of `Document` objects, with `n` entries per unique metadata combination.
+     """
+     # Create a dictionary to store the sampled documents per metadata combination
+     sampled_docs = {}
+
+     for doc in documents:
+         combo = (doc.metadata["platform"], doc.metadata["company"])
+         if combo not in sampled_docs:
+             sampled_docs[combo] = []
+
+         # Add the document to the list for its metadata combination, up to n entries
+         if len(sampled_docs[combo]) < n:
+             sampled_docs[combo].append(doc)
+
+     # Flatten the dictionary into a single list
+     return [doc for docs in sampled_docs.values() for doc in docs]
+
+ def to_documents(data) -> List[Document]:
+     social_media_document = []
+     for item in data:
+         social_media_document.append(Document(
+             page_content=str(item["page_content"]),
+             metadata={"platform": item["platform"],
+                       "company": item["company"],
+                       "ingestion_timestamp": datetime.now().isoformat(),
+                       "word_count": len(item["page_content"]["content"])
+                       }))
      return social_media_document
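For orientation, the only functional change in this commit is the path: create_files and fetch_social_media_data now write and read under files/ (resolved against the current working directory) instead of the Stock Sentiment Analysis/files prefix. A minimal round-trip sketch, using a hypothetical input record (only the keys these helpers actually read are shown):

# Hypothetical record; the real shape comes from the ingestion pipeline.
sample_data = [{
    "platform": "Twitter",        # assumed platform label
    "company": "AAPL",            # assumed company label
    "page_content": {"content": "Strong earnings beat expectations."},
}]

create_files(sample_data)          # writes files/social_media_data.{json,csv,pkl}
docs = fetch_social_media_data()   # reads files/social_media_data.json back as Documents

print(docs[0].metadata["platform"])    # Twitter
# Note: "word_count" applies len() to the content string, so it is a
# character count despite the name.
print(docs[0].metadata["word_count"])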
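sample_documents is a deterministic head-sample: it keeps the first n documents it encounters for each unique (platform, company) pair, in input order, rather than drawing a random sample. A quick sketch with hypothetical platform and company labels:

from langchain.schema import Document

docs = [
    Document(page_content=f"post {i} about {c} on {p}",
             metadata={"platform": p, "company": c})
    for p in ("Twitter", "Reddit")   # hypothetical platforms
    for c in ("AAPL", "TSLA")        # hypothetical companies
    for i in range(5)
]

sampled = sample_documents(docs, n=2)
print(len(sampled))  # 8: two docs kept for each of the four (platform, company) pairs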
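The pickle helpers are symmetric save/load pairs and, unlike the functions touched by this commit, still point at the Stock Sentiment Analysis/files prefix; open(..., 'wb') will not create that directory, so a caller must ensure it exists first. A small sketch continuing from the example above:

import os

os.makedirs('Stock Sentiment Analysis/files', exist_ok=True)
save_ingested_data(sampled)            # pickles the list from the sketch above
restored = get_ingested_data()
print(len(restored) == len(sampled))   # True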