File size: 4,297 Bytes
1ce01d1
 
 
d9ea2c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc07482
d9ea2c1
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
from transformers import TapasTokenizer, TFTapasForQuestionAnswering
import pandas as pd

from transformers import TapasTokenizer, TapasForQuestionAnswering
import pandas as pd
import re

# Matches an unsigned integer or decimal number at the start of a string
# (e.g. "5", "4.25"). Written as a raw string because "\d" and "\." are not
# valid Python string escapes and newer interpreters warn about them in a
# plain literal; the compiled pattern is unchanged.
p = re.compile(r'\d+(\.\d+)?')

# Example questions for the table question-answering model.
queries = [
    # NOTE: the title is followed by a tab and a space; the tab is spelled as
    # an explicit escape so the whitespace is visible in the source.
    "When did Spider-Man: No Way Home\t release?",
    "which Movies have rating 5?",
]

def load_model_and_tokenizer(model_name="google/tapas-base-finetuned-wtq"):
  """
    Load a pretrained TAPAS tokenizer and question-answering model.

    Args:
      model_name: Checkpoint identifier handed to ``from_pretrained`` for both
        the tokenizer and the model. Defaults to the TAPAS base checkpoint
        fine-tuned on WikiTable Questions (WTQ).

    Returns:
      A ``(tokenizer, model)`` tuple.

    Note:
      Every call invokes ``from_pretrained`` again; callers answering many
      questions should load once and reuse the returned pair.
  """
  # Tokenizer and model are loaded from the same checkpoint name.
  tokenizer = TapasTokenizer.from_pretrained(model_name)
  model = TapasForQuestionAnswering.from_pretrained(model_name)

  return tokenizer, model


def prepare_inputs(table, queries, tokenizer, max_rows=100):
  """
    Truncate and stringify a table, then tokenize it together with the queries.

    Args:
      table: pandas DataFrame holding the table to question. It is not
        modified; a truncated, stringified copy is returned.
      queries: List of question strings.
      tokenizer: Tokenizer called with ``table``, ``queries``, ``padding`` and
        ``return_tensors`` keyword arguments (a ``TapasTokenizer`` in this file).
      max_rows: Maximum number of leading rows kept from ``table``
        (default 100, the previously hard-coded limit).

    Returns:
      A ``(table, inputs)`` tuple: the truncated table with every cell cast to
      ``str``, and the tokenizer output requested as PyTorch tensors.
  """
  # Only the first `max_rows` rows are kept, and every cell is cast to text
  # before tokenization (the TAPAS tokenizer documentation asks for a
  # text-only table).
  table = table.head(max_rows).astype('str')
  inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt")

  return table, inputs


def generate_predictions(inputs, model, tokenizer):
  """
    Run the model on tokenized inputs and decode its logits.

    Args:
      inputs: Mapping of tokenized inputs; unpacked as keyword arguments to
        ``model`` and also passed to the tokenizer for decoding.
      model: Callable whose result exposes ``logits`` and
        ``logits_aggregation`` attributes.
      tokenizer: Object providing ``convert_logits_to_predictions``.

    Returns:
      A ``(predicted_table_cell_coords, predicted_aggregation_operators)``
      tuple as produced by ``tokenizer.convert_logits_to_predictions``.
  """
  # Forward pass.
  model_output = model(**inputs)

  # Decode cell-selection and aggregation logits; both are detached first.
  decode = tokenizer.convert_logits_to_predictions
  cell_coords, aggregation_ids = decode(
      inputs,
      model_output.logits.detach(),
      model_output.logits_aggregation.detach(),
  )

  return cell_coords, aggregation_ids


def postprocess_predictions(predicted_aggregation_operators, predicted_table_cell_coords, table):
  """
    Translate raw predictions into operator names and answer strings.

    Args:
      predicted_aggregation_operators: One aggregation index per query
        (0=NONE, 1=SUM, 2=AVERAGE, 3=COUNT).
      predicted_table_cell_coords: One list of cell coordinates per query;
        each coordinate is used as a positional index into ``table.iat``.
      table: pandas DataFrame the coordinates refer to.

    Returns:
      A ``(aggregation_predictions_string, answers)`` tuple. The first item is
      the list of operator names. The second holds one answer per query: the
      cell value itself when exactly one cell was selected, otherwise the
      selected cell values joined with ``", "`` (an empty string when no cell
      was selected).

    Raises:
      KeyError: If an aggregation index is not one of 0-3.
  """
  # Map aggregation indices to their names.
  aggregation_operators = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
  aggregation_predictions_string = [aggregation_operators[x] for x in predicted_aggregation_operators]

  # Look up the selected cells for every query.
  answers = []
  for coordinates in predicted_table_cell_coords:
    cell_values = [table.iat[coordinate] for coordinate in coordinates]
    if len(cell_values) == 1:
      # Exactly one cell: return its value unchanged.
      answers.append(cell_values[0])
    else:
      # Zero or several cells: join them. str() keeps the join from failing
      # when the table holds non-text cells.
      answers.append(", ".join(str(value) for value in cell_values))

  return aggregation_predictions_string, answers


# Matches a whole cell that holds an optionally signed integer or decimal number.
_NUMERIC_CELL = re.compile(r"[+-]?\d+(\.\d+)?")


def show_answers(queries, answers, aggregation_predictions_string):
  """
    Print and return the final answer for every query.

    Args:
      queries: List of question strings.
      answers: One answer per query; several cell values are expected to be
        joined with ``", "``.
      aggregation_predictions_string: One operator name per query
        ("NONE", "SUM", "AVERAGE" or "COUNT").

    Returns:
      A string with one ``"Predicted answer: ..."`` line per query, joined by
      newlines (an empty string when there are no queries):
        * NONE: the answer text as is.
        * COUNT: the number of selected cells (0 for an empty answer).
        * SUM / AVERAGE: the computed value when every cell is numeric,
          otherwise ``"<OPERATOR> > <answer>"`` so the caller still sees the
          selected cells.

    Side effects:
      Prints each query followed by its answer line.
  """
  reducers = {
    "SUM": sum,
    "AVERAGE": lambda values: sum(values) / len(values),
  }

  lines = []
  for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
    print(query)
    # A single selected cell may arrive as a non-string value.
    answer = str(answer)
    # An empty answer means no cell was selected, not one empty cell.
    cells = answer.split(", ") if answer else []

    if predicted_agg == "NONE":
      result = answer
    elif predicted_agg == "COUNT":
      result = str(len(cells))
    elif predicted_agg in reducers and cells and all(_NUMERIC_CELL.fullmatch(cell) for cell in cells):
      # fullmatch: a cell such as "2021-01-01" starts with digits but is not
      # a number and must not reach float().
      result = str(reducers[predicted_agg]([float(cell) for cell in cells]))
    else:
      # Aggregation requested over non-numeric (or no) cells.
      result = predicted_agg + " > " + answer

    line = "Predicted answer: " + result
    print(line)
    lines.append(line)

  return "\n".join(lines)
      



# Tokenizer/model pair, loaded on the first query and reused afterwards so
# the pretrained weights are not reloaded for every question.
_TAPAS_RESOURCES = {}


def execute_query(query, table):
  """
    Answer a single natural-language question about a table with TAPAS.

    Args:
      query: The question string.
      table: pandas DataFrame to query.

    Returns:
      The value returned by ``show_answers`` for this single query.
  """
  if "pair" not in _TAPAS_RESOURCES:
    # Only stored when loading succeeds, so a failed load is retried on the
    # next call.
    _TAPAS_RESOURCES["pair"] = load_model_and_tokenizer()
  tokenizer, model = _TAPAS_RESOURCES["pair"]

  # The pipeline works on a list of queries; wrap the single question.
  queries = [query]
  table, inputs = prepare_inputs(table, queries, tokenizer)
  predicted_table_cell_coords, predicted_aggregation_operators = generate_predictions(inputs, model, tokenizer)
  aggregation_predictions_string, answers = postprocess_predictions(predicted_aggregation_operators, predicted_table_cell_coords, table)
  return show_answers(queries, answers, aggregation_predictions_string)