SexBot / milvusDB /prepare_milvus.py
Pew404's picture
Upload folder using huggingface_hub
318db6e verified
from pymilvus import MilvusClient, DataType, FieldSchema, CollectionSchema, Collection
# Milvus endpoint; each prepare_* function below passes it as `uri` to MilvusClient.
URI = "http://localhost:19530"
def prepare_sex_ed_article_milvus():
    """Rebuild the ``t_sur_sex_ed_article_spider`` collection from scratch.

    Connects to the server at ``URI``, drops the collection, then recreates
    it with a VARCHAR primary key ``id``, the scalar fields ``title``,
    ``chunk``, ``link`` and ``category``, and three 1024-dimensional float
    vector fields (``title_vector``, ``chunk_vector``, ``tags``). Each vector
    field gets an IVF_FLAT index with the COSINE metric and ``nlist=128``.
    The collection's load state is printed at the end.

    WARNING: the drop happens unconditionally, so this is a destructive
    setup step.
    """
    collection_name = "t_sur_sex_ed_article_spider"

    milvus = MilvusClient(uri=URI)
    milvus.drop_collection(collection_name)

    # Schema: the field order here is the order declared on the collection.
    schema = CollectionSchema(
        fields=[
            FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000),
            FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000),
            FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
            FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=2000),
            FieldSchema(name="chunk_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
            # NOTE: despite the plain name, "tags" is declared as a vector field.
            FieldSchema(name="tags", dtype=DataType.FLOAT_VECTOR, dim=1024),
            FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512),
            FieldSchema(name="category", dtype=DataType.VARCHAR, max_length=128),
        ],
        auto_id=False,
        enable_dynamic_field=True,
        description="Schema of collection: t_sur_sex_ed_article_spider",
    )

    # Indexes: identical settings for every vector field.
    indexes = milvus.prepare_index_params()
    for vector_field in ("title_vector", "chunk_vector", "tags"):
        indexes.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    milvus.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=indexes,
    )

    status = milvus.get_load_state(collection_name)
    print(f"t_sur_sex_ed_article_spider:{status}")
def prepare_sex_ed_qa_milvus():
    """Rebuild the ``t_sur_sex_ed_question_answer_spider`` collection.

    Connects to the server at ``URI``, drops the collection, then recreates
    it with a VARCHAR primary key ``id``, scalar metadata fields (``url``,
    ``title``, ``content``, ``content_type``, ``author``, ``avatar_url``,
    ``likes``, ``dislikes``) and two 1024-dimensional float vector fields
    (``title_vector``, ``content_vector``). Both vector fields get an
    IVF_FLAT index with the COSINE metric and ``nlist=128``. The collection's
    load state is printed at the end.

    WARNING: the drop happens unconditionally, so this is a destructive
    setup step.
    """
    collection_name = "t_sur_sex_ed_question_answer_spider"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # Fields (the primary key local is `pk` so the builtin `id` is not shadowed).
    pk = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000)
    url = FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=1000)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    content = FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=2000)
    v_content = FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    content_type = FieldSchema(name="content_type", dtype=DataType.VARCHAR, max_length=8)
    author = FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64)
    avatar_url = FieldSchema(name="avatar_url", dtype=DataType.VARCHAR, max_length=1024)
    likes = FieldSchema(name="likes", dtype=DataType.INT32)
    dislikes = FieldSchema(name="dislikes", dtype=DataType.INT32)

    # Collection schema
    collection_schema = CollectionSchema(
        fields=[
            pk, url, title, v_title, content, v_content,
            content_type, author, avatar_url, likes, dislikes,
        ],
        auto_id=False,
        # NOTE: the keyword is singular. A misspelled keyword such as
        # `enable_dynamic_fields` is not rejected by CollectionSchema; it
        # simply leaves dynamic fields disabled.
        enable_dynamic_field=True,
        description="Sex Education QA",
    )

    # Indexes: identical settings for both vector fields.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "content_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    # Create the collection and report its load state.
    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )
    status = client.get_load_state(collection_name)
    print(f"t_sur_sex_ed_question_answer_spider:{status}")
def prepare_sex_ed_youtube():
    """Rebuild the ``t_sur_sex_ed_youtube_spider`` collection.

    Connects to the server at ``URI``, drops the collection, then recreates
    it with a VARCHAR primary key ``id``, scalar metadata fields (``link``,
    ``title``, ``views``, ``author``, ``picture``, ``likes``, ``duration``,
    ``tag``, ``delete_status``) and two 1024-dimensional float vector fields
    (``title_vector``, ``tag_vector``). Both vector fields get an IVF_FLAT
    index with the COSINE metric and ``nlist=128``. The collection's load
    state is printed at the end.

    WARNING: the drop happens unconditionally, so this is a destructive
    setup step.
    """
    collection_name = "t_sur_sex_ed_youtube_spider"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # Fields (the primary key local is `pk` so the builtin `id` is not shadowed).
    # `views`, `likes` and `duration` are declared as strings in this collection.
    pk = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000)
    link = FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    views = FieldSchema(name="views", dtype=DataType.VARCHAR, max_length=64)
    author = FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64)
    picture = FieldSchema(name="picture", dtype=DataType.VARCHAR, max_length=512)
    likes = FieldSchema(name="likes", dtype=DataType.VARCHAR, max_length=64)
    duration = FieldSchema(name="duration", dtype=DataType.VARCHAR, max_length=64)
    tag = FieldSchema(name="tag", dtype=DataType.VARCHAR, max_length=64)
    v_tag = FieldSchema(name="tag_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    delete_status = FieldSchema(name="delete_status", dtype=DataType.INT8)

    # Collection schema
    collection_schema = CollectionSchema(
        fields=[
            pk, link, title, v_title, views, author, picture,
            likes, duration, tag, v_tag, delete_status,
        ],
        auto_id=False,
        # NOTE: the keyword is singular. A misspelled keyword such as
        # `enable_dynamic_fields` is not rejected by CollectionSchema; it
        # simply leaves dynamic fields disabled.
        enable_dynamic_field=True,
        description="Sex Education videos collection",
    )

    # Indexes: identical settings for both vector fields.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "tag_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    # Create the collection and report its load state.
    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )
    status = client.get_load_state(collection_name)
    print(f"t_sur_sex_ed_youtube_spider:{status}")
def prepare_pornVideo():
    """Rebuild the ``t_sur_video`` collection.

    Connects to the server at ``URI``, drops the collection, then recreates
    it with ``url`` (VARCHAR) as the primary key, scalar metadata fields
    (``duration``, ``viewCount``, ``coverPicture``, ``title``, ``uploader``,
    ``categories``, ``resourceType``, ``sexualPreference``) and two
    1024-dimensional float vector fields (``title_vector``,
    ``categories_vector``). Both vector fields get an IVF_FLAT index with
    the COSINE metric and ``nlist=128``. The collection's load state is
    printed at the end.

    WARNING: the drop happens unconditionally, so this is a destructive
    setup step.
    """
    collection_name = "t_sur_video"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # Fields. Unlike the other collections in this file, the primary key
    # here is the `url` field rather than an `id` field.
    url = FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=256, is_primary=True)
    duration = FieldSchema(name="duration", dtype=DataType.INT64)
    view_count = FieldSchema(name="viewCount", dtype=DataType.INT64)
    cover_picture = FieldSchema(name="coverPicture", dtype=DataType.VARCHAR, max_length=1024)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    uploader = FieldSchema(name="uploader", dtype=DataType.VARCHAR, max_length=256)
    categories = FieldSchema(name="categories", dtype=DataType.VARCHAR, max_length=1024)
    v_categories = FieldSchema(name="categories_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    resource_type = FieldSchema(name="resourceType", dtype=DataType.INT8)
    sexual_preference = FieldSchema(name="sexualPreference", dtype=DataType.INT8)

    # Collection schema
    collection_schema = CollectionSchema(
        fields=[
            url, duration, view_count, cover_picture, title, v_title,
            uploader, categories, v_categories, resource_type, sexual_preference,
        ],
        auto_id=False,
        # NOTE: the keyword is singular. A misspelled keyword such as
        # `enable_dynamic_fields` is not rejected by CollectionSchema; it
        # simply leaves dynamic fields disabled.
        enable_dynamic_field=True,
        description="Sexual Education Videos",
    )

    # Indexes: identical settings for both vector fields.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "categories_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    # Create the collection and report its load state.
    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )
    status = client.get_load_state(collection_name)
    print(f"t_sur_video:{status}")
if __name__ == '__main__':
    # Script entry point: rebuild every collection, one after another, in
    # this fixed order. An exception in one step stops the remaining steps.
    for prepare_collection in (
        prepare_sex_ed_article_milvus,
        prepare_sex_ed_qa_milvus,
        prepare_sex_ed_youtube,
        prepare_pornVideo,
    ):
        prepare_collection()