In [None]:
import json
import pandas as pd

# LOCATION OF THE OSM DATA FOR FINE-TUNING
# CSV of OSM places. Later cells read its columns by position:
# 1 = name, 2 = latitude, 3 = longitude.
data = 'tutorial_datasets/osm_mn.csv'


In [None]:
## CONSTRUCT DATASET FOR FINE TUNING ##

# Read the OSM records from the .csv data file.
state_frame = pd.read_csv(data)

# Build parallel lists of names and [lng, lat] coordinates.
# Columns are selected by position with .iloc (1 = name, 2 = latitude,
# 3 = longitude). Plain `row[1]` on a label-indexed Series relies on pandas'
# deprecated "integer key means position" fallback, and walking the frame
# with iterrows() is needlessly slow for what is a column extraction.
name_list = state_frame.iloc[:, 1].tolist()
lat_values = state_frame.iloc[:, 2].tolist()
lng_values = state_frame.iloc[:, 3].tolist()
coordinate_list = [[lng_value, lat_value] for lng_value, lat_value in zip(lng_values, lat_values)]

# Construct a KDTree from the coordinate list; it is queried later to build
# the neighbor list of every place. Index i in the tree corresponds to
# name_list[i] / coordinate_list[i].
import scipy.spatial as scp

ordered_neighbor_coordinate_list = scp.KDTree(coordinate_list)

In [None]:
# Preview only the first rows: enough to check the column layout without
# rendering the whole table into the notebook output.
state_frame.head()

In [None]:

# Get the nearest neighbors of each entity in the dataset and write one JSON
# object per line in the structure used for SpaBERT embedding construction.

# Number of results requested per query. Every query point is itself in the
# tree, so it normally shows up among its own results at distance 0; 21
# results are requested to leave room for 20 other places.
# NOTE(review): the place itself is kept in its neighbor list, exactly as
# before — confirm this is what the dataset loader expects.
neighbors_per_query = 21
num_places = len(name_list)

# Query every place in a single call instead of once per row. Row i of the
# returned index array holds the neighbors of place i, nearest first.
_, all_neighbor_idx = ordered_neighbor_coordinate_list.query(coordinate_list, k=neighbors_per_query)

with open('tutorial_datasets/SPABERT_finetuning_data.json', 'w') as out_f:
    for name, coordinates, neighbor_idx in zip(name_list, coordinate_list, all_neighbor_idx):
        # we want to store the neighbors' names and coordinates
        nearest_neighbors_name = []
        nearest_neighbors_coords = []

        for idx in neighbor_idx:
            # When the tree holds fewer points than requested, KDTree pads the
            # result with the out-of-range index n. Skip those slots instead
            # of crashing with an IndexError on small datasets.
            if idx >= num_places:
                continue
            nearest_neighbors_name.append(name_list[idx])
            nearest_neighbors_coords.append({"coordinates": coordinate_list[idx]})

        # neighbor info dictionary object for SpaBERT embedding construction
        neighbor_info = {"name_list": nearest_neighbors_name, "geometry_list": nearest_neighbors_coords}

        # full dictionary object for SpaBERT embedding construction
        place = {"info": {"name": name, "geometry": {"coordinates": coordinates}}, "neighbor_info": neighbor_info}

        out_f.write(json.dumps(place) + '\n')

In [None]:
### FINE-TUNE SPABERT
import sys
from transformers.models.bert.modeling_bert import BertForMaskedLM
from transformers import BertTokenizer
sys.path.append("../")
from models.spatial_bert_model import SpatialBertConfig
from utils.common_utils import load_spatial_bert_pretrained_weights
from models.spatial_bert_model import SpatialBertForMaskedLM

# JSON-lines fine-tuning data produced earlier in this notebook.
dataset = 'tutorial_datasets/SPABERT_finetuning_data.json'

# Checkpoint file holding the pre-trained SpaBERT weights.
pretrained_model = 'tutorial_datasets/mlm_mem_keeppos_ep0_iter06000_0.2936.pth'

# Tokenizer and base BERT masked-LM model (both from the same checkpoint
# name), plus a default SpaBERT configuration.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
bert_model = BertForMaskedLM.from_pretrained(bert_checkpoint)
config = SpatialBertConfig()

In [None]:
# Build the SpaBERT masked-LM model and initialize it in two stages:
# first from plain BERT, then from the pre-trained SpaBERT checkpoint.
import torch
model = SpatialBertForMaskedLM(config)

# strict=False: parameter names that exist in only one of the two models are
# ignored instead of raising an error.
model.load_state_dict(bert_model.state_dict(), strict=False)

# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only machine as well; the model is moved to the training device in a
# later cell.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a
# trusted source.
pre_trained_model = torch.load(pretrained_model, map_location='cpu')

# Overlay every checkpoint tensor whose key also exists in the model, and
# report the keys the checkpoint does not provide.
model_keys = model.state_dict()
cnt_layers = 0
for key in model_keys:
    if key in pre_trained_model:
        model_keys[key] = pre_trained_model[key]
        cnt_layers += 1
    else:
        print("No weight for", key)
print(cnt_layers, 'layers loaded')

model.load_state_dict(model_keys)

In [None]:
from datasets.osm_sample_loader import PbfMapDataset
from torch.utils.data import DataLoader
# Wrap the fine-tuning JSON file in the SpaBERT dataset class.
dataset_options = dict(
    data_file_path=dataset,
    tokenizer=tokenizer,
    max_token_len=300,
    distance_norm_factor=0.0001,
    spatial_dist_fill=20,
    with_type=False,
    sep_between_neighbors=False,
    label_encoder=None,
    mode=None,
)
fine_tune_dataset = PbfMapDataset(**dataset_options)

# Initialize the data loader that batches the samples for training.
# Samples are served in file order (shuffle=False) and a trailing incomplete
# batch is dropped (drop_last=True).
train_loader = DataLoader(
    fine_tune_dataset,
    batch_size=12,
    num_workers=5,
    shuffle=False,
    pin_memory=True,
    drop_last=True,
)



In [None]:
import torch
# Run on the GPU when CUDA is available and fall back to the CPU otherwise.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# Put the model in training mode. As the last expression of the cell, the
# returned module is also displayed by the notebook.
model.train()

In [None]:
### FINE TUNING PROCEDURE ###
from tqdm import tqdm

# Initialize the optimizer with PyTorch's AdamW. The AdamW class exported by
# `transformers` is deprecated and is no longer available in recent releases.
# eps and weight_decay are set to the defaults of that old implementation so
# the optimizer configuration stays as close as possible to the original.
optim = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# One pass over the data loader, wrapped in a tqdm progress bar that reports
# the loss of the most recent batch.
progress_bar = tqdm(train_loader, leave=True)
for batch in progress_bar:
    # reset the gradients accumulated during the previous step
    optim.zero_grad()

    # pull all tensor batches required for training onto the training device
    input_ids = batch['masked_input'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    position_list_x = batch['norm_lng_list'].to(device)
    position_list_y = batch['norm_lat_list'].to(device)
    sent_position_ids = batch['sent_position_ids'].to(device)

    # the unmasked pseudo sentence is the prediction target
    labels = batch['pseudo_sentence'].to(device)

    # forward pass; passing labels makes the model return the loss
    outputs = model(input_ids, attention_mask=attention_mask, sent_position_ids=sent_position_ids,
                    position_list_x=position_list_x, position_list_y=position_list_y, labels=labels)

    loss = outputs.loss

    # backpropagation followed by the parameter update
    loss.backward()
    optim.step()

    progress_bar.set_postfix({'loss': loss.item()})

# persist the fine-tuned weights
torch.save(model.state_dict(), "tutorial_datasets/fine-spabert-base-uncased-finetuned-osm-mn.pth")
