Akhilgautam30 committed on
Commit 52c6760 · 1 Parent(s): 8243432

added code based on rogelio's changes

Files changed (3)
  1. .github/workflows/main.yml +1 -1
  2. main.py +20 -13
  3. model_utils.py +69 -20
.github/workflows/main.yml CHANGED
@@ -16,4 +16,4 @@ jobs:
       - name: Push to hub
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        run: git push --force https://akhilgautam2011%40gmail.com:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/Akhilgautam30/personality_assesment main
+        run: git push https://akhilgautam2011%40gmail.com:${{ secrets.HF_TOKEN }}@huggingface.co/spaces/Akhilgautam30/personality_assesment main
main.py CHANGED
@@ -1,31 +1,38 @@
-# API/main.py
-
-# main.py (in the root directory)
-#test command
+# main.py
+
 import sys
 import os
-from model_utils import predict_personality
 from fastapi import FastAPI
+from model_utils import load_model_and_weights, single_predict
 
 app = FastAPI()
 
+# Load the model and tokenizer
+output_folder = '.'  # Adjust this path as needed
+hugging_model = 'roberta-base'
+model = load_model_and_weights(hugging_model, output_folder)
+
+# Root path handler for unit test
 @app.get("/")
 async def root():
-    return {"message": "Personality Assessment API is running"}
+    test_text = ("always a problem. My hair is really wet and I should go dry it, but this assignment is what I need to do now. "
+                 "I almost slept through my eight o clock class, but I somehow made it. Ok this show keeps getting cheezier and cheezier "
+                 "oh dear. I have to cash a check and deposit it so my check book balances, which is something that needs to be done and "
+                 "really quickly because I will have to pay extra for all the hot checks I have written- uh oh. My twenty minutes probably "
+                 "seems shorter because I am a slower typist than most people. PROPNAME is a psycho whore, I hate hate her. Something shocking "
+                 "happens on this show every 0 seconds. I don't think that Days of our lives is a good show, but I seem to be addicted to it "
+                 "anyway. PROPNAME is so nice and her and LOCNAME are finally together, but probably not for long because there is")
+    predictions = single_predict(model, test_text)
+    return {"predictions": predictions}
 
 @app.get("/predict")
 async def predict_personality_get(text: str):
-    try:
-        print("--------------------------")
-        predictions = predict_personality(text)
-        return {"predictions": predictions}
-    except NameError:
-        return {"error": "predict_personality function not available"}
-
+    predictions = single_predict(model, text)
+    return {"predictions": predictions}
 
 if __name__ == "__main__":
     import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host="0.0.0.0", port=7860)

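The updated main.py loads the classifier once at import time and serves two GET endpoints. Below is a minimal client-side sketch of exercising them; it assumes the app is running locally on port 7860 (matching the uvicorn call above) and that the third-party requests package is installed, neither of which is part of this commit.

```python
# Hypothetical client sketch, not part of this commit.
import requests

BASE_URL = "http://localhost:7860"  # assumed local uvicorn instance; swap in the Space URL if deployed

# Root endpoint: runs single_predict on the hard-coded sample essay (doubles as a smoke test).
print(requests.get(f"{BASE_URL}/").json())

# /predict endpoint: scores caller-supplied text against the five trait labels.
resp = requests.get(
    f"{BASE_URL}/predict",
    params={"text": "I enjoy quiet evenings and detailed planning."},
)
print(resp.json())
# Expected shape: {"predictions": {"probability": {...}, "predicted_label": {...}}}
```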
model_utils.py CHANGED
@@ -3,21 +3,81 @@
 import os
 import tensorflow as tf
 from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+import numpy as np
+from nltk.corpus import stopwords
+from keras.preprocessing.text import Tokenizer
 
 # Define the personality trait labels
 traits = ['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']
 
-def load_model_and_weights():
-    model_name = "roberta-base"
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
+def preprocess(docs):
+    stopwrd = set(stopwords.words('english'))
+    t = Tokenizer(num_words=20000, filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n')
+    t.fit_on_texts(docs)
+    encoded_docs = t.texts_to_sequences(docs)
+    idx2word = {v: k for k, v in t.word_index.items()}
+
+    def abbreviation_handler(text):
+        ln = text.lower()
+        ln = ln.replace(r"'t", " not")
+        ln = ln.replace(r"'s", " is")
+        ln = ln.replace(r"'ll", " will")
+        ln = ln.replace(r"'ve", " have")
+        ln = ln.replace(r"'re", " are")
+        ln = ln.replace(r"'m", " am")
+        ln = ln.replace(r"'", " ")
+        return ln
+
+    def stopwords_handler(text):
+        words = text.split()
+        new_words = [w for w in words if w not in stopwrd]
+        return ' '.join(new_words)
+
+    def sequence_to_text(listOfSequences):
+        tokenized_list = []
+        for text in listOfSequences:
+            newText = ''
+            for num in text:
+                newText += idx2word[num] + ' '
+            newText = abbreviation_handler(newText)
+            newText = stopwords_handler(newText)
+            tokenized_list.append(newText)
+        return tokenized_list
+
+    newLists = sequence_to_text(encoded_docs)
+    return newLists
+
+def tokenize_text(text, hugging_model='roberta-base'):
+    clean_text = preprocess(text)
+    tokenizer = AutoTokenizer.from_pretrained(hugging_model)
+    inputs = tokenizer(clean_text, padding=True, truncation=True, return_tensors='tf')
+    x = dict(inputs)
+    return x
+
+def single_predict(model, text, traits=['cAGR', 'cCON', 'cEXT', 'cOPN', 'cNEU']):
+    traits_scores = dict()
+    predicted_labels = dict()
+    x = tokenize_text([text])
+    logits = model.predict(x, verbose=0).logits
+    probs = tf.math.sigmoid(logits).numpy()
+    predictions = np.where(probs > 0.5, 1, 0)
+    for t, s in zip(traits, probs[0]):
+        traits_scores[t] = s
+    for t, l in zip(traits, predictions[0]):
+        predicted_labels[t] = l
+    final_dic = {'probability': traits_scores, 'predicted_label': predicted_labels}
+    return final_dic
+
+def load_model_and_weights(hugging_model='roberta-base', output_folder='.'):
     model = TFAutoModelForSequenceClassification.from_pretrained(
-        model_name,
-        num_labels=len(traits),
-        problem_type="multi_label_classification"
+        hugging_model, num_labels=len(traits), problem_type="multi_label_classification"
     )
-
-    # Load custom weights
-    weights_path = os.path.join(os.getcwd(), 'weights-roberta-base.h5')
+    if len(hugging_model.split('/')) > 1:
+        _hugging_model = hugging_model.split('/')[1]
+    else:
+        _hugging_model = hugging_model.split('/')[0]
 
+    weights_path = os.path.join(output_folder, f'weights-{_hugging_model}.h5')
     if os.path.exists(weights_path):
         try:
             model.load_weights(weights_path)
@@ -28,15 +88,4 @@ def load_model_and_weights():
     else:
         print(f"Warning: Custom weights file not found at {weights_path}")
         print("Using default weights.")
-
-    return tokenizer, model
-
-# Load the model and tokenizer
-tokenizer, model = load_model_and_weights()
-
-def predict_personality(text):
-    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding=True, max_length=512)
-    outputs = model(inputs)
-    probabilities = tf.nn.sigmoid(outputs.logits)[0]  # Using sigmoid for multi-label
-    predictions = [{"trait": trait, "score": float(prob)} for trait, prob in zip(traits, probabilities)]
-    return predictions
+    return model
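
The new model_utils.py splits the pipeline into preprocess (Keras tokenization round-trip, contraction expansion, NLTK stopword removal), tokenize_text (RoBERTa encoding), single_predict (sigmoid probabilities thresholded at 0.5 per trait), and load_model_and_weights (builds the multi-label head and restores weights-<model>.h5 from output_folder when present). A minimal sketch of driving the module directly, assuming the NLTK stopword corpus has already been downloaded (that download is not handled by this commit):

```python
# Hypothetical usage sketch, not part of this commit.
import nltk
nltk.download("stopwords")  # preprocess() reads nltk.corpus.stopwords at call time

from model_utils import load_model_and_weights, single_predict

# Without weights-roberta-base.h5 in '.', this falls back to the untrained classification head.
model = load_model_and_weights(hugging_model="roberta-base", output_folder=".")

result = single_predict(model, "I stayed up all night reorganizing my study schedule.")
print(result["probability"])      # per-trait sigmoid scores, e.g. {'cAGR': 0.51, ...} (numpy float32)
print(result["predicted_label"])  # 0/1 labels from the 0.5 threshold, e.g. {'cAGR': 1, ...}
```

One caveat worth flagging: single_predict returns numpy float32/int64 scalars, which JSON encoders may reject when the FastAPI endpoints serialize them; casting with float()/int() before returning would be a safe hardening step.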