faisaldadkhan13 committed on
Commit
226aae3
·
verified ·
1 Parent(s): a034a51

Upload 7 files

Browse files
Files changed (5) hide show
  1. README.md +1 -21
  2. inference.py +23 -0
  3. requirements.txt +2 -1
  4. save_model.py +5 -5
  5. training/train.py +45 -13
README.md CHANGED
@@ -1,23 +1,3 @@
1
- ---
2
- license: mit
3
- datasets:
4
- - lunaopenlabs/LunaAi-dataset
5
- language:
6
- - en
7
- metrics:
8
- - character
9
- base_model:
10
- - lunaopenlabs/LunaAI
11
- new_version: lunaopenlabs/LunaAI
12
- tags:
13
- - code
14
- - ai
15
- - luna
16
- - openlabs
17
- - open
18
- - source
19
- - text-generation-inference
20
- ---
21
  # Luna AI
22
 
23
  Luna AI is an open-source AI model developed by Luna OpenLabs for text classification tasks. Leveraging the BERT architecture, this model is designed to classify text into predefined categories efficiently and accurately.
@@ -115,4 +95,4 @@ Open a pull request.
115
  This project is licensed under the MIT License. See the LICENSE file for details.
116
 
117
  ### Contact
118
- For questions, suggestions, or feedback, feel free to contact the Luna OpenLabs team at [[email protected]].
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # Luna AI
2
 
3
  Luna AI is an open-source AI model developed by Luna OpenLabs for text classification tasks. Leveraging the BERT architecture, this model is designed to classify text into predefined categories efficiently and accurately.
 
95
  This project is licensed under the MIT License. See the LICENSE file for details.
96
 
97
  ### Contact
98
+ For questions, suggestions, or feedback, feel free to contact the Luna OpenLabs team at [[email protected]].
inference.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # inference.py
2
+ import torch
3
+ from transformers import BertTokenizer
4
+ from model.luna_model import LunaAI
5
+
6
def predict(text, model_path='./luna_ai_model', num_classes=2, max_length=512):
    """Classify ``text`` with a saved LunaAI checkpoint.

    Args:
        text: Input string to classify.
        model_path: Directory containing ``pytorch_model.bin`` and the
            tokenizer files read by ``BertTokenizer.from_pretrained``.
        num_classes: Number of classes passed to the ``LunaAI`` constructor;
            must match the checkpoint being loaded.
        max_length: Maximum number of tokens kept; longer inputs are
            truncated instead of being passed to the model unbounded.
            NOTE(review): 512 assumes a BERT-base style encoder inside
            LunaAI -- confirm against model/luna_model.py.

    Returns:
        The index of the highest-scoring class as a Python ``int``.
    """
    model = LunaAI(num_classes=num_classes)
    # map_location='cpu' lets a checkpoint written on a GPU machine load on a
    # CPU-only host; the model itself is never moved off the CPU here.
    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    state_dict = torch.load(f"{model_path}/pytorch_model.bin", map_location='cpu')
    model.load_state_dict(state_dict)
    model.eval()

    tokenizer = BertTokenizer.from_pretrained(model_path)
    encoding = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    with torch.no_grad():
        # Assumes the model returns one row of class scores per input
        # (the original reduced over dim=1 as well).
        output = model(encoding['input_ids'], encoding['attention_mask'])
    return output.argmax(dim=1).item()
19
+
20
+ if __name__ == "__main__":
21
+ sample_text = "Sample text to classify"
22
+ prediction = predict(sample_text)
23
+ print(f"Prediction: {prediction}")
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  torch
2
  transformers
3
  datasets
4
- huggingface_hub
 
 
1
  torch
2
  transformers
3
  datasets
4
+ huggingface_hub
5
+ scikit-learn
save_model.py CHANGED
@@ -2,11 +2,11 @@
2
  from model.luna_model import LunaAI
3
  from transformers import BertTokenizer
4
 
5
- def save_model(model):
6
- model.save_pretrained('./luna_ai_model')
7
  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
8
- tokenizer.save_pretrained('./luna_ai_model')
9
 
10
  if __name__ == "__main__":
11
- model = LunaAI()
12
- save_model(model)
 
2
  from model.luna_model import LunaAI
3
  from transformers import BertTokenizer
4
 
5
def save_model(model, path='./luna_ai_model', tokenizer_name='bert-base-uncased'):
    """Write the model weights and the tokenizer files into ``path``.

    Args:
        model: Model to persist. If it exposes ``save_pretrained`` that method
            is used, exactly as before. Otherwise its ``state_dict()`` is
            written to ``pytorch_model.bin``, the file name inference.py
            loads with ``load_state_dict``.
        path: Target directory.
        tokenizer_name: Name or path of the pretrained tokenizer to copy
            alongside the weights.
    """
    # Local imports: the top of save_model.py is not guaranteed to import them.
    import os

    import torch

    if hasattr(model, 'save_pretrained'):
        model.save_pretrained(path)
    else:
        # Plain torch modules have no save_pretrained; store the raw weights.
        os.makedirs(path, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(path, 'pytorch_model.bin'))

    tokenizer = BertTokenizer.from_pretrained(tokenizer_name)
    tokenizer.save_pretrained(path)
9
 
10
  if __name__ == "__main__":
11
+ model = LunaAI(num_classes=2) # Adjust num_classes if necessary
12
+ save_model(model)
training/train.py CHANGED
@@ -1,16 +1,18 @@
1
  # training/train.py
 
2
  import pandas as pd
3
- from torch.utils.data import DataLoader, Dataset
4
- from transformers import BertTokenizer
5
- from model.luna_model import LunaAI
6
  import torch
7
  import torch.nn as nn
8
- from transformers import AdamW
 
 
 
9
 
10
  class TextDataset(Dataset):
11
- def __init__(self, csv_file):
12
  self.data = pd.read_csv(csv_file)
13
- self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
14
 
15
  def __len__(self):
16
  return len(self.data)
@@ -23,7 +25,7 @@ class TextDataset(Dataset):
23
  add_special_tokens=True,
24
  return_tensors='pt',
25
  padding='max_length',
26
- max_length=128,
27
  truncation=True,
28
  )
29
  return {
@@ -32,12 +34,35 @@ class TextDataset(Dataset):
32
  'labels': torch.tensor(label, dtype=torch.long),
33
  }
34
 
35
- def train_model(model, dataset):
36
- dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
37
- optimizer = AdamW(model.parameters(), lr=5e-5)
 
 
 
 
 
 
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  model.train()
40
- for epoch in range(3): # Adjust the number of epochs
 
41
  for batch in dataloader:
42
  input_ids = batch['input_ids']
43
  attention_mask = batch['attention_mask']
@@ -50,7 +75,14 @@ def train_model(model, dataset):
50
  optimizer.step()
51
  print(f'Epoch {epoch}, Loss: {loss.item()}')
52
 
 
 
 
 
 
 
53
  if __name__ == "__main__":
54
- dataset = TextDataset('data/dataset.csv')
55
- model = LunaAI()
 
56
  train_model(model, dataset)
 
1
  # training/train.py
2
+ import os
3
  import pandas as pd
 
 
 
4
  import torch
5
  import torch.nn as nn
6
+ from torch.utils.data import DataLoader, Dataset
7
+ from transformers import BertTokenizer, AdamW
8
+ from model.luna_model import LunaAI
9
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
 
11
  class TextDataset(Dataset):
12
+ def __init__(self, csv_file, tokenizer, max_length=128):
13
  self.data = pd.read_csv(csv_file)
14
+ self.tokenizer = tokenizer
15
+ self.max_length = max_length
16
 
17
  def __len__(self):
18
  return len(self.data)
 
25
  add_special_tokens=True,
26
  return_tensors='pt',
27
  padding='max_length',
28
+ max_length=self.max_length,
29
  truncation=True,
30
  )
31
  return {
 
34
  'labels': torch.tensor(label, dtype=torch.long),
35
  }
36
 
37
def evaluate_model(model, dataloader):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to eval mode for the duration of the call and then
    returned to whatever mode it was in before. Without that restore, a
    training loop that evaluates at the end of each epoch would run all
    later epochs in eval mode.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
            Assumes it returns per-class scores with classes on dim 1.
        dataloader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` entries.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``; precision, recall and F1
        use weighted averaging.
    """
    was_training = model.training
    model.eval()
    predictions, true_labels = [], []
    try:
        with torch.no_grad():
            for batch in dataloader:
                outputs = model(batch['input_ids'], batch['attention_mask'])
                preds = outputs.argmax(dim=1)
                predictions.extend(preds.cpu().numpy())
                true_labels.extend(batch['labels'].cpu().numpy())
    finally:
        # Hand the model back in the mode the caller had it in.
        model.train(was_training)

    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted'
    )
    return accuracy, precision, recall, f1
50
+
51
def save_checkpoint(epoch, model, optimizer, loss, path="./checkpoints"):
    """Save a training checkpoint for ``epoch`` and return its file path.

    Args:
        epoch: Epoch number; stored in the checkpoint and used in the file name.
        model: Object whose ``state_dict()`` is stored.
        optimizer: Object whose ``state_dict()`` is stored.
        loss: Loss value recorded with the checkpoint.
        path: Directory for checkpoint files; created if it does not exist.

    Returns:
        Path of the written ``checkpoint_epoch_<epoch>.pth`` file.
    """
    os.makedirs(path, exist_ok=True)
    checkpoint_file = os.path.join(path, f"checkpoint_epoch_{epoch}.pth")
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
    }
    torch.save(checkpoint, checkpoint_file)
    return checkpoint_file
59
+
60
+ def train_model(model, dataset, epochs=3, batch_size=16, learning_rate=5e-5):
61
+ dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
62
+ optimizer = AdamW(model.parameters(), lr=learning_rate)
63
  model.train()
64
+
65
+ for epoch in range(epochs):
66
  for batch in dataloader:
67
  input_ids = batch['input_ids']
68
  attention_mask = batch['attention_mask']
 
75
  optimizer.step()
76
  print(f'Epoch {epoch}, Loss: {loss.item()}')
77
 
78
+ save_checkpoint(epoch, model, optimizer, loss.item())
79
+
80
+ # Optional: Evaluate the model at each epoch end
81
+ accuracy, precision, recall, f1 = evaluate_model(model, dataloader)
82
+ print(f'Epoch {epoch} - Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}')
83
+
84
  if __name__ == "__main__":
85
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
86
+ dataset = TextDataset('data/dataset.csv', tokenizer)
87
+ model = LunaAI(num_classes=2) # Adjust num_classes if necessary
88
  train_model(model, dataset)