File size: 2,775 Bytes
245b7fc
 
7f61b34
245b7fc
 
 
 
93f6be7
 
245b7fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482f94f
245b7fc
 
93f6be7
 
482f94f
 
 
 
 
 
93f6be7
 
482f94f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import gradio as gr
# Load FinBERT (a BERT model fine-tuned for financial sentiment) and its
# matching tokenizer from the Hugging Face Hub; downloaded on first run,
# then served from the local cache.
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')



def predict(input_text):
  """Classify the financial sentiment of `input_text`.

  The text is tokenized without special tokens, split into chunks of at most
  510 content tokens, and each chunk is wrapped with [CLS]/[SEP] and padded to
  512 tokens. Class probabilities are averaged over all chunks so that texts
  longer than BERT's 512-token window are still handled.

  Args:
      input_text (str): Raw text to classify.

  Returns:
      str: "positive", "negative", or "neutral" (FinBERT's label order is
      0=positive, 1=negative, 2=neutral).
  """
  tokens = tokenizer.encode_plus(input_text, add_special_tokens=False, return_tensors='pt')

  def get_input_ids_and_attention_mask_chunk():
    """Split input_ids/attention_mask into 512-token chunks.

    Each chunk of at most `chunksize - 2` tokens gets [CLS] (id 101) prepended
    and [SEP] (id 102) appended, then is zero-padded up to `chunksize`.

    Returns:
        tuple[list[torch.Tensor], list[torch.Tensor]]: chunked input_ids and
        attention_masks, each tensor of length `chunksize`.
    """
    chunksize = 512
    input_id_chunks = list(tokens['input_ids'][0].split(chunksize - 2))
    attention_mask_chunks = list(tokens['attention_mask'][0].split(chunksize - 2))

    for i in range(len(input_id_chunks)):
      input_id_chunks[i] = torch.cat([
          torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
      ])
      attention_mask_chunks[i] = torch.cat([
          torch.tensor([1]), attention_mask_chunks[i], torch.tensor([1])
      ])

      pad_length = chunksize - input_id_chunks[i].shape[0]
      if pad_length > 0:
        # Pad with the same dtype as the chunk (torch.Tensor([...]) would
        # create float32 tensors and silently upcast the concatenation).
        input_id_chunks[i] = torch.cat([
            input_id_chunks[i],
            torch.zeros(pad_length, dtype=input_id_chunks[i].dtype)
        ])
        attention_mask_chunks[i] = torch.cat([
            attention_mask_chunks[i],
            torch.zeros(pad_length, dtype=attention_mask_chunks[i].dtype)
        ])

    return input_id_chunks, attention_mask_chunks

  input_id_chunks, attention_mask_chunks = get_input_ids_and_attention_mask_chunk()
  input_dict = {
      'input_ids': torch.stack(input_id_chunks).long(),
      'attention_mask': torch.stack(attention_mask_chunks).int()
  }

  # Inference only: skip autograd bookkeeping to save memory and time.
  with torch.no_grad():
    outputs = model(**input_dict)

  probabilities = torch.nn.functional.softmax(outputs[0], dim=-1)
  mean_probabilities = probabilities.mean(dim=0)  # average over chunks
  # argmax over 3 logits is always 0, 1, or 2 — index straight into the labels.
  labels = ("positive", "negative", "neutral")
  return labels[torch.argmax(mean_probabilities).item()]

# Wire the classifier into a simple text-in/text-out Gradio interface and
# start the local web server. `live=True` re-runs prediction as the user types;
# flagging is disabled.
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(label="Write a text"),
    outputs=gr.Textbox(label="output a text"),
    title="Financial Sentiment Analysis",
    live=True,
    allow_flagging="never",
)

demo.launch()