Vikram Thangaraj committed
Commit 883aeef · 1 Parent(s): 5d2f5b2
app.py CHANGED
@@ -1,27 +1,45 @@
  import gradio as gr
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
-
- # Load the local Tamil GPT-2 model
- tokenizer = AutoTokenizer.from_pretrained("model")
- model = AutoModelForCausalLM.from_pretrained("model")
-
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
-
- def generate_kavithai(prompt):
-     if not prompt.strip():
-         return "தயவுசெய்து ஒரு வரியை உள்ளிடவும்..."
-     outputs = generator(prompt, max_length=1000, num_return_sequences=1)
-     return outputs[0]["generated_text"]
-
- # Gradio UI setup
- interface = gr.Interface(
-     fn=generate_kavithai,
-     inputs=gr.Textbox(lines=2, placeholder="உங்கள் எண்ணத்தை இங்கே எழுதுங்கள்...", label="📝 உங்கள் வரிகள்"),
-     outputs=gr.Textbox(label="🎙️ கவிதை வெளியீடு"),
-     title="தமிழ் கவிதை AI Bot ✍️",
-     description="தமிழில் கவிதை உருவாக்கும் AI. உங்கள் வார்த்தைகளைப் பகிருங்கள் – ஒரு கவிதையை உருவாக்குவோம்!",
-     theme="soft"
+ from datasets import load_dataset
+
+ # Load the Tamil Kavithai Tanglish dataset
+ hf_dataset = load_dataset("abishekmahi/tamil-kavithai-tanglish", split="train")
+ df = hf_dataset.to_pandas()
+
+ # Ensure columns are strings
+ df["TanglishTitle"] = df["TanglishTitle"].astype(str)
+ df["TanglishContent"] = df["TanglishContent"].astype(str)
+ df["TanglishCategory"] = df["TanglishCategory"].astype(str)
+
+ # Define search logic
+ def search_kavithai(query):
+     query = query.lower()
+     result = df[df.apply(lambda row:
+         query in row["TanglishTitle"].lower() or
+         query in row["TanglishContent"].lower() or
+         query in row["TanglishCategory"].lower(), axis=1)]
+
+     if result.empty:
+         return "🙏 Sorry, no matching Kavithai found."
+
+     kavithai = result.iloc[0]
+     return f"""
+ 🏷️ **Title**: {kavithai['TanglishTitle']}
+
+ 📜 **Category**: {kavithai['TanglishCategory']}
+
+ 📝 **Kavithai**:
+ {kavithai['TanglishContent']}
+ """
+
+ # Gradio interface
+ chat_interface = gr.Interface(
+     fn=search_kavithai,
+     inputs=gr.Textbox(lines=2, placeholder="Enter a theme, title or word...", label="🔍 Your Query"),
+     outputs=gr.Textbox(label="📖 Tamil Kavithai"),
+     title="தமிழ் கவிதை Chatbot ✍️",
+     description="Search by theme, title or keywords to discover Tamil Kavithai in Tanglish.",
+     allow_flagging="never"
  )

  if __name__ == "__main__":
-     interface.launch()
+     chat_interface.launch()
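
A quick local sanity check of the new dataset-backed search could look like the sketch below; the queries "amma" and "zzzz" are made-up examples, and importing app assumes the file above is saved as app.py alongside the script.

# smoke_test.py: hypothetical helper, not part of this commit
from app import df, search_kavithai  # importing app loads the dataset and builds the UI, but does not launch it

print(df.columns.tolist())            # expect TanglishTitle, TanglishContent, TanglishCategory among the columns
print(search_kavithai("amma"))        # example keyword; returns the first matching kavithai as formatted text
print(search_kavithai("zzzz"))        # unlikely to match; returns the "no matching Kavithai found" message
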
loadmodel.py DELETED
@@ -1,14 +0,0 @@
- from transformers import AutoTokenizer, AutoModelForCausalLM
-
- # Model name
- model_name = "abinayam/gpt-2-tamil"
-
- # Load from Hugging Face
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)
-
- # Save to local folder called 'model'
- model.save_pretrained("model")
- tokenizer.save_pretrained("model")
-
- print("✅ Model and tokenizer saved successfully in './model'")
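
With the local ./model copy also removed in this commit, this download-and-save step is no longer needed. If the generation path were ever brought back, the model would more likely be pulled from the Hub at startup; a minimal sketch under that assumption (the prompt string is just an example):

# Hypothetical alternative if text generation is reinstated later (not part of this commit)
from transformers import pipeline

generator = pipeline("text-generation", model="abinayam/gpt-2-tamil")  # loads straight from the Hub, no local save
print(generator("காதல்", max_length=100, num_return_sequences=1)[0]["generated_text"])
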
model/added_tokens.json DELETED
@@ -1,3 +0,0 @@
- {
-   "<|endoftext|>": 50265
- }
model/config.json DELETED
@@ -1,40 +0,0 @@
- {
-   "_name_or_path": "abinayam/gpt-2-tamil",
-   "activation_function": "gelu_new",
-   "architectures": [
-     "GPT2LMHeadModel"
-   ],
-   "attn_pdrop": 0.0,
-   "bos_token_id": 50256,
-   "embd_pdrop": 0.0,
-   "eos_token_id": 50256,
-   "gradient_checkpointing": false,
-   "initializer_range": 0.02,
-   "layer_norm_epsilon": 1e-05,
-   "model_type": "gpt2",
-   "n_ctx": 1024,
-   "n_embd": 768,
-   "n_head": 12,
-   "n_inner": null,
-   "n_layer": 12,
-   "n_positions": 1024,
-   "reorder_and_upcast_attn": false,
-   "resid_pdrop": 0.0,
-   "scale_attn_by_inverse_layer_idx": false,
-   "scale_attn_weights": true,
-   "summary_activation": null,
-   "summary_first_dropout": 0.1,
-   "summary_proj_to_labels": true,
-   "summary_type": "cls_index",
-   "summary_use_proj": true,
-   "task_specific_params": {
-     "text-generation": {
-       "do_sample": true,
-       "max_length": 300
-     }
-   },
-   "torch_dtype": "float32",
-   "transformers_version": "4.49.0",
-   "use_cache": true,
-   "vocab_size": 50257
- }
model/generation_config.json DELETED
@@ -1,6 +0,0 @@
- {
-   "_from_model_config": true,
-   "bos_token_id": 50256,
-   "eos_token_id": 50256,
-   "transformers_version": "4.49.0"
- }
model/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
model/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ed1bed64b8bf4b6e42492c492d6f15af5018557be24fbab7edfefed65cd353d7
- size 497774208
model/special_tokens_map.json DELETED
@@ -1,5 +0,0 @@
- {
-   "bos_token": "<|endoftext|>",
-   "eos_token": "<|endoftext|>",
-   "unk_token": "<|endoftext|>"
- }
model/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json DELETED
@@ -1,60 +0,0 @@
- {
-   "add_prefix_space": false,
-   "added_tokens_decoder": {
-     "0": {
-       "content": "<s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "1": {
-       "content": "<pad>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "2": {
-       "content": "</s>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "3": {
-       "content": "<unk>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "4": {
-       "content": "<mask>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     },
-     "50265": {
-       "content": "<|endoftext|>",
-       "lstrip": false,
-       "normalized": false,
-       "rstrip": false,
-       "single_word": false,
-       "special": true
-     }
-   },
-   "bos_token": "<|endoftext|>",
-   "clean_up_tokenization_spaces": false,
-   "eos_token": "<|endoftext|>",
-   "extra_special_tokens": {},
-   "model_max_length": 1000000000000000019884624838656,
-   "tokenizer_class": "GPT2Tokenizer",
-   "unk_token": "<|endoftext|>"
- }
model/vocab.json DELETED
The diff for this file is too large to render. See raw diff