Vikram Thangaraj committed · Commit 883aeef
1 Parent(s): 5d2f5b2
update
Browse files
- app.py +41 -23
- loadmodel.py +0 -14
- model/added_tokens.json +0 -3
- model/config.json +0 -40
- model/generation_config.json +0 -6
- model/merges.txt +0 -0
- model/model.safetensors +0 -3
- model/special_tokens_map.json +0 -5
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +0 -60
- model/vocab.json +0 -0
app.py
CHANGED
@@ -1,27 +1,45 @@
 import gradio as gr
-from …
-# Load the …
-…
+from datasets import load_dataset
+
+# Load the Tamil Kavithai Tanglish dataset
+hf_dataset = load_dataset("abishekmahi/tamil-kavithai-tanglish", split="train")
+df = hf_dataset.to_pandas()
+
+# Ensure columns are strings
+df["TanglishTitle"] = df["TanglishTitle"].astype(str)
+df["TanglishContent"] = df["TanglishContent"].astype(str)
+df["TanglishCategory"] = df["TanglishCategory"].astype(str)
+
+# Define search logic
+def search_kavithai(query):
+    query = query.lower()
+    result = df[df.apply(lambda row:
+        query in row["TanglishTitle"].lower() or
+        query in row["TanglishContent"].lower() or
+        query in row["TanglishCategory"].lower(), axis=1)]
+
+    if result.empty:
+        return "🙏 Sorry, no matching Kavithai found."
+
+    kavithai = result.iloc[0]
+    return f"""
+🏷️ **Title**: {kavithai['TanglishTitle']}
+
+📜 **Category**: {kavithai['TanglishCategory']}
+
+📝 **Kavithai**:
+{kavithai['TanglishContent']}
+"""
+
+# Gradio interface
+chat_interface = gr.Interface(
+    fn=search_kavithai,
+    inputs=gr.Textbox(lines=2, placeholder="Enter a theme, title or word...", label="🔍 Your Query"),
+    outputs=gr.Textbox(label="📖 Tamil Kavithai"),
+    title="தமிழ் கவிதை Chatbot ✍️",
+    description="Search by theme, title or keywords to discover Tamil Kavithai in Tanglish.",
+    allow_flagging="never"
 )
 
 if __name__ == "__main__":
-    …
+    chat_interface.launch()
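Aside on the new search logic: df.apply(..., axis=1) calls a Python lambda once per row, which gets slow as the dataset grows. Below is a minimal vectorized sketch of the same literal-substring filter, assuming the three Tanglish columns from app.py; search_kavithai_fast is a hypothetical name, not part of this commit.

import pandas as pd

def search_kavithai_fast(df: pd.DataFrame, query: str) -> pd.DataFrame:
    # case=False makes the match case-insensitive; regex=False treats the
    # query as a literal substring, mirroring the "in" checks in app.py.
    # Assumes the columns were already cast to str, as app.py does.
    mask = (
        df["TanglishTitle"].str.contains(query, case=False, regex=False)
        | df["TanglishContent"].str.contains(query, case=False, regex=False)
        | df["TanglishCategory"].str.contains(query, case=False, regex=False)
    )
    return df[mask]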
loadmodel.py
DELETED
@@ -1,14 +0,0 @@
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-# Model name
-model_name = "abinayam/gpt-2-tamil"
-
-# Load from Hugging Face
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name)
-
-# Save to local folder called 'model'
-model.save_pretrained("model")
-tokenizer.save_pretrained("model")
-
-print("✅ Model and tokenizer saved successfully in './model'")
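For context on what was removed: loadmodel.py only cached abinayam/gpt-2-tamil into the model/ folder deleted below. A hedged sketch of how such a local snapshot is typically loaded back and sampled with transformers; the prompt string is an illustrative placeholder, and this does not claim to reproduce the old app.py.

from transformers import AutoTokenizer, AutoModelForCausalLM

# "model" here is the local folder written by loadmodel.py, not a Hub ID.
tokenizer = AutoTokenizer.from_pretrained("model")
model = AutoModelForCausalLM.from_pretrained("model")

inputs = tokenizer("கவிதை", return_tensors="pt")  # hypothetical prompt
outputs = model.generate(**inputs, max_new_tokens=40, do_sample=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))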
model/added_tokens.json
DELETED
@@ -1,3 +0,0 @@
-{
-  "<|endoftext|>": 50265
-}
model/config.json
DELETED
@@ -1,40 +0,0 @@
-{
-  "_name_or_path": "abinayam/gpt-2-tamil",
-  "activation_function": "gelu_new",
-  "architectures": [
-    "GPT2LMHeadModel"
-  ],
-  "attn_pdrop": 0.0,
-  "bos_token_id": 50256,
-  "embd_pdrop": 0.0,
-  "eos_token_id": 50256,
-  "gradient_checkpointing": false,
-  "initializer_range": 0.02,
-  "layer_norm_epsilon": 1e-05,
-  "model_type": "gpt2",
-  "n_ctx": 1024,
-  "n_embd": 768,
-  "n_head": 12,
-  "n_inner": null,
-  "n_layer": 12,
-  "n_positions": 1024,
-  "reorder_and_upcast_attn": false,
-  "resid_pdrop": 0.0,
-  "scale_attn_by_inverse_layer_idx": false,
-  "scale_attn_weights": true,
-  "summary_activation": null,
-  "summary_first_dropout": 0.1,
-  "summary_proj_to_labels": true,
-  "summary_type": "cls_index",
-  "summary_use_proj": true,
-  "task_specific_params": {
-    "text-generation": {
-      "do_sample": true,
-      "max_length": 300
-    }
-  },
-  "torch_dtype": "float32",
-  "transformers_version": "4.49.0",
-  "use_cache": true,
-  "vocab_size": 50257
-}
model/generation_config.json
DELETED
@@ -1,6 +0,0 @@
-{
-  "_from_model_config": true,
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
-  "transformers_version": "4.49.0"
-}
model/merges.txt
DELETED
The diff for this file is too large to render. See raw diff.
model/model.safetensors
DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ed1bed64b8bf4b6e42492c492d6f15af5018557be24fbab7edfefed65cd353d7
-size 497774208
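The pointer's size is a rough sanity check against the deleted config: a 12-layer, 768-wide GPT-2 has roughly 124 million parameters, and in float32 that is about 124,000,000 × 4 bytes ≈ 496 MB, consistent with the 497,774,208-byte weight file once the safetensors header is included.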
model/special_tokens_map.json
DELETED
@@ -1,5 +0,0 @@
-{
-  "bos_token": "<|endoftext|>",
-  "eos_token": "<|endoftext|>",
-  "unk_token": "<|endoftext|>"
-}
model/tokenizer.json
DELETED
The diff for this file is too large to render. See raw diff.
model/tokenizer_config.json
DELETED
@@ -1,60 +0,0 @@
-{
-  "add_prefix_space": false,
-  "added_tokens_decoder": {
-    "0": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "1": {
-      "content": "<pad>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "2": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "3": {
-      "content": "<unk>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "<mask>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "50265": {
-      "content": "<|endoftext|>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    }
-  },
-  "bos_token": "<|endoftext|>",
-  "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
-  "extra_special_tokens": {},
-  "model_max_length": 1000000000000000019884624838656,
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
-}
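One detail worth decoding in the config above: the huge model_max_length is not a real limit but the transformers "no maximum" sentinel, which is simply int(1e30) after float rounding.

# Reproduces the sentinel seen in tokenizer_config.json above.
print(int(1e30))  # 1000000000000000019884624838656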
model/vocab.json
DELETED
The diff for this file is too large to render. See raw diff.