File size: 10,852 Bytes

784a7e2

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Augmentation by paraphrasing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Init & Load Seed Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import json, openai\n",
    "from tqdm import tqdm "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Choose the dataset directory; exactly one DOMAIN assignment should be active.\n",
    "DOMAIN = \"drone-planning/\"\n",
    "# DOMAIN = \"clean-up/\"\n",
    "# DOMAIN = \"pick-and-place/\"\n",
    "\n",
    "# train_seed.jsonl is read as one JSON document per line.\n",
    "train_seed = []\n",
    "with open(DOMAIN + \"train_seed.jsonl\") as seed_file:\n",
    "    for raw_line in seed_file:\n",
    "        train_seed.append(json.loads(raw_line))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every seed sentence to the (initially empty) list of its paraphrases.\n",
    "eng_seeds = {}\n",
    "for seed in train_seed:\n",
    "    eng_seeds[seed['natural']] = []"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Augmentation Code\n",
    "Prompting GPT-3 seems to work best in this case."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The OpenAI API key is read from the OPENAI_API_KEY environment variable so that it\n",
    "# never has to be pasted into (and saved with) the notebook. If the variable is not\n",
    "# set, the \"TO_BE_SET\" placeholder is used and must be replaced before running.\n",
    "# Keys are managed at https://beta.openai.com/account/api-keys\n",
    "openai.api_key = os.environ.get(\"OPENAI_API_KEY\", \"TO_BE_SET\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize(sentence):\n",
    "    \"\"\"Capitalize the first letter of `sentence` and make it end with a period.\n",
    "\n",
    "    Args:\n",
    "        sentence: The sentence to normalize.\n",
    "\n",
    "    Returns:\n",
    "        The sentence with a lowercase first character upper-cased and a '.'\n",
    "        appended when it does not already end with one. An empty string is\n",
    "        returned unchanged.\n",
    "    \"\"\"\n",
    "    if not sentence:\n",
    "        # Indexing an empty string would raise IndexError.\n",
    "        return sentence\n",
    "    if sentence[0].islower():\n",
    "        sentence = sentence[0].upper() + sentence[1:]\n",
    "    if not sentence.endswith('.'):\n",
    "        sentence = sentence + '.'\n",
    "    return sentence\n",
    "\n",
    "def parse_sentences_from_response(response, num_expected=10):\n",
    "    \"\"\"Extract the numbered paraphrases from a completion.\n",
    "\n",
    "    The first line of `response` is expected to hold the text of item 1\n",
    "    without its number (a leading '1.' is prepended here before checking),\n",
    "    followed by lines starting with '2. ', '3. ', and so on.\n",
    "\n",
    "    Args:\n",
    "        response: Raw completion text.\n",
    "        num_expected: Number of numbered lines the completion must contain.\n",
    "\n",
    "    Returns:\n",
    "        A list of `num_expected` sentences with the 'N. ' prefix removed and\n",
    "        trailing whitespace stripped.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If the number of lines is wrong, a line does not start\n",
    "            with its 'N. ' prefix, or a line contains no sentence.\n",
    "    \"\"\"\n",
    "    # Trailing blank lines carry no content and must not break the line count.\n",
    "    lines = response.rstrip().split('\\n')\n",
    "    if len(lines) != num_expected:\n",
    "        raise ValueError(\"expected %d lines, got %d\" % (num_expected, len(lines)))\n",
    "    lines[0] = \"1.\" + lines[0]\n",
    "    paraphrases = []\n",
    "    for idx, line in enumerate(lines, start=1):\n",
    "        prefix = str(idx) + '. '\n",
    "        if not line.startswith(prefix):\n",
    "            raise ValueError(\"line %d does not start with %r: %r\" % (idx, prefix, line))\n",
    "        # Strip while building the list; rebinding a loop variable afterwards\n",
    "        # would leave the stored strings unchanged.\n",
    "        paraphrase = line[len(prefix):].rstrip()\n",
    "        if not paraphrase:\n",
    "            raise ValueError(\"line %d contains no sentence\" % idx)\n",
    "        paraphrases.append(paraphrase)\n",
    "    return paraphrases\n",
    "\n",
    "\n",
    "# The prompt ends with \"1.\" so that the completion starts with the first paraphrase.\n",
    "PROMPT = \"\"\"Rephrase the source sentence in 10 different ways. Make the outputs as diverse as possible.\n",
    "\n",
    "Source: \n",
    "SOURCE-TO-BE-PLACED\n",
    "\n",
    "Outputs:\n",
    "1.\"\"\"\n",
    "def rephrase_a_sentence(sentence):\n",
    "    \"\"\"Request paraphrases of `sentence` from the completion model.\n",
    "\n",
    "    The sentence is normalized, substituted into PROMPT and sent to\n",
    "    `openai.Completion.create`; the text of the first choice is then parsed.\n",
    "\n",
    "    Args:\n",
    "        sentence: The source sentence to paraphrase.\n",
    "\n",
    "    Returns:\n",
    "        The list of parsed paraphrases. If the completion cannot be parsed,\n",
    "        the raw text is printed and the tuple `(output, \"ERROR\")` is returned\n",
    "        instead, so callers must check the type before using the result.\n",
    "    \"\"\"\n",
    "    response = openai.Completion.create(\n",
    "        model=\"text-davinci-002\",\n",
    "        prompt=PROMPT.replace(\"SOURCE-TO-BE-PLACED\", normalize(sentence)),\n",
    "        temperature=0.7,\n",
    "        max_tokens=512,\n",
    "        top_p=1,\n",
    "        best_of=1,\n",
    "        frequency_penalty=0.1,\n",
    "        presence_penalty=0\n",
    "        )\n",
    "    output = response['choices'][0]['text']\n",
    "    try:\n",
    "        # Parse once and return that result instead of parsing a second time.\n",
    "        return parse_sentences_from_response(output)\n",
    "    except Exception:\n",
    "        # `Exception` rather than a bare except, so Ctrl-C still interrupts.\n",
    "        print(\"Error in parsing response\")\n",
    "        print(output)\n",
    "        return output, \"ERROR\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Try the pipeline on a single sentence; this sends a real request to the OpenAI API.\n",
    "O = rephrase_a_sentence(\"Go to the red room or go to the green room to finally go to the blue room.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "O"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Run Augmentation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "len(eng_seeds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list(eng_seeds.keys())[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upper bound on passes over the seeds, so a sentence whose completion never\n",
    "# parses cannot keep the loop (and the API calls) running forever.\n",
    "MAX_ROUNDS = 5\n",
    "\n",
    "def paraphrase_done(eng_seeds):\n",
    "    \"\"\"Return True when every seed sentence has at least one paraphrase.\"\"\"\n",
    "    return all(len(extended) > 0 for extended in eng_seeds.values())\n",
    "\n",
    "for _ in range(MAX_ROUNDS):\n",
    "    if paraphrase_done(eng_seeds):\n",
    "        break\n",
    "    for eng_seed, extended in tqdm(eng_seeds.items()):\n",
    "        if len(extended) == 0:\n",
    "            result = rephrase_a_sentence(eng_seed)\n",
    "            # A failed parse comes back as the tuple (raw_output, \"ERROR\").\n",
    "            # Extending the list with it would store garbage as paraphrases,\n",
    "            # so only lists are accepted; the seed stays empty and is retried\n",
    "            # on the next pass.\n",
    "            if isinstance(result, list):\n",
    "                extended += result\n",
    "\n",
    "num_missing = sum(1 for extended in eng_seeds.values() if len(extended) == 0)\n",
    "if num_missing:\n",
    "    print(\"%d seed sentence(s) still have no paraphrases after %d rounds\" % (num_missing, MAX_ROUNDS))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eng_seeds"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dump as Training Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_seed[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each seed example is written first, followed by one record per paraphrase\n",
    "# that reuses the seed's 'canonical' and 'formula' fields.\n",
    "with open(DOMAIN + \"syn-aug.train.jsonl\", 'w') as out_file:\n",
    "    for seed in train_seed:\n",
    "        out_file.write(json.dumps(seed) + '\\n')\n",
    "        for paraphrase in eng_seeds[seed['natural']]:\n",
    "            record = {\n",
    "                'natural': paraphrase,\n",
    "                'canonical': seed['canonical'],\n",
    "                'formula': seed['formula'],\n",
    "            }\n",
    "            out_file.write(json.dumps(record) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seed-only training file: one JSON object per line, no paraphrases.\n",
    "with open(DOMAIN + \"syn.train.jsonl\", 'w') as out_file:\n",
    "    out_file.writelines(json.dumps(seed) + '\\n' for seed in train_seed)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Normalize the natural language form "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if DOMAIN in (\"clean-up/\", \"pick-and-place/\"):\n",
    "    # In clean up and in pick and place, golden natural language data comes without\n",
    "    # period at the end and with no capitalization in the beginning.\n",
    "    def clean_up_normalize(sentence):\n",
    "        \"\"\"Lower-case an upper-case first letter and drop one trailing period.\n",
    "\n",
    "        An empty string is returned unchanged.\n",
    "        \"\"\"\n",
    "        if not sentence:\n",
    "            # Indexing an empty string would raise IndexError.\n",
    "            return sentence\n",
    "        if sentence[0].isupper():\n",
    "            sentence = sentence[0].lower() + sentence[1:]\n",
    "        if sentence.endswith('.'):\n",
    "            sentence = sentence[:-1]\n",
    "        return sentence\n",
    "\n",
    "    # The file is rewritten in place, so it is read completely before it is\n",
    "    # reopened for writing.\n",
    "    with open(DOMAIN + \"syn-aug.train.jsonl\", 'r') as f:\n",
    "        buffer = [json.loads(l) for l in f]\n",
    "\n",
    "    with open(DOMAIN + \"syn-aug.train.jsonl\", 'w') as f:\n",
    "        for dp in buffer:\n",
    "            f.write(json.dumps({\n",
    "                'natural': clean_up_normalize(dp['natural']),\n",
    "                'canonical': dp['canonical'],\n",
    "                'formula': dp['formula']\n",
    "            }) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "if DOMAIN == \"drone-planning/\":\n",
    "    # In drone planning, golden natural language data comes with a \"space + period\"\n",
    "    # at the end and with no capitalization in the beginning.\n",
    "    def clean_up_normalize(sentence):\n",
    "        \"\"\"Rewrite `sentence` in the drone-planning surface format.\n",
    "\n",
    "        An upper-case first letter is lower-cased, trailing spaces, periods and\n",
    "        exclamation marks are replaced by a single period, and every period\n",
    "        and comma ends up preceded by exactly one space. Applying the function\n",
    "        to its own output returns the same string, so re-running the cell does\n",
    "        not keep inserting spaces. A sentence that is empty, or consists only\n",
    "        of the stripped characters, yields an empty string.\n",
    "        \"\"\"\n",
    "        if not sentence:\n",
    "            return sentence\n",
    "        if sentence[0].isupper():\n",
    "            sentence = sentence[0].lower() + sentence[1:]\n",
    "        core = sentence.rstrip(' .!')\n",
    "        if not core:\n",
    "            # Nothing but punctuation and spaces: there is no sentence to terminate.\n",
    "            return core\n",
    "        # Collapse any spaces already in front of '.' or ',' into exactly one.\n",
    "        return re.sub(r' *([.,])', r' \\1', core + '.')\n",
    "\n",
    "    # Only the seed-only file is normalized here; syn-aug.train.jsonl is left\n",
    "    # untouched. Add it to this tuple to normalize it as well.\n",
    "    for filename in (\"syn.train.jsonl\",):\n",
    "        # The file is rewritten in place, so read it completely first.\n",
    "        with open(DOMAIN + filename, 'r') as f:\n",
    "            buffer = [json.loads(l) for l in f]\n",
    "\n",
    "        with open(DOMAIN + filename, 'w') as f:\n",
    "            for dp in buffer:\n",
    "                f.write(json.dumps({\n",
    "                    'natural': clean_up_normalize(dp['natural']),\n",
    "                    'canonical': dp['canonical'],\n",
    "                    'formula': dp['formula']\n",
    "                }) + '\\n')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "GPML",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.13"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "75567ad983eac98a78c1e40a895e8d82557b42cf9969286235abec07ddbf9e7d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}