|
from pymilvus import MilvusClient, DataType, FieldSchema, CollectionSchema, Collection |
|
|
|
# Milvus endpoint handed to ``MilvusClient(uri=...)`` by every prepare_* function below.
URI = "http://localhost:19530"
|
def prepare_sex_ed_article_milvus():
    """Recreate the ``t_sur_sex_ed_article_spider`` collection.

    Steps, in order: connect to ``URI``, drop the collection by name, declare
    the schema (VARCHAR primary key ``id``, text fields and three 1024-dim
    float-vector fields), register an IVF_FLAT / COSINE index on each vector
    field, create the collection, then print the load state reported by the
    client.
    """
    collection_name = "t_sur_sex_ed_article_spider"

    milvus = MilvusClient(uri=URI)
    milvus.drop_collection(collection_name)

    # Field order here is the order the schema is declared with.
    schema_fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="chunk_vector", dtype=DataType.FLOAT_VECTOR, dim=1024),
        # NOTE: "tags" is stored as a vector field (no separate text column).
        FieldSchema(name="tags", dtype=DataType.FLOAT_VECTOR, dim=1024),
        FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="category", dtype=DataType.VARCHAR, max_length=128),
    ]
    schema = CollectionSchema(
        fields=schema_fields,
        auto_id=False,
        enable_dynamic_field=True,
        description="Schema of collection: t_sur_sex_ed_article_spider",
    )

    # Every vector field gets the same index configuration.
    indexes = milvus.prepare_index_params()
    for vector_field in ("title_vector", "chunk_vector", "tags"):
        indexes.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    milvus.create_collection(
        collection_name=collection_name,
        schema=schema,
        index_params=indexes,
    )

    status = milvus.get_load_state(collection_name)
    print(f"t_sur_sex_ed_article_spider:{status}")
|
|
|
|
|
def prepare_sex_ed_qa_milvus():
    """Recreate the ``t_sur_sex_ed_question_answer_spider`` collection.

    Connects to ``URI``, drops the collection by name, declares the schema
    (VARCHAR primary key ``id``, text/metadata fields, INT32 like/dislike
    counters and two 1024-dim float-vector fields), registers an IVF_FLAT /
    COSINE index on ``title_vector`` and ``content_vector``, creates the
    collection and prints the load state reported by the client.
    """
    collection_name = "t_sur_sex_ed_question_answer_spider"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # Named ``id_field`` rather than ``id`` so the builtin is not shadowed.
    id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000)
    url = FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=1000)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    content = FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=2000)
    v_content = FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    content_type = FieldSchema(name="content_type", dtype=DataType.VARCHAR, max_length=8)
    author = FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64)
    avatar_url = FieldSchema(name="avatar_url", dtype=DataType.VARCHAR, max_length=1024)
    likes = FieldSchema(name="likes", dtype=DataType.INT32)
    dislikes = FieldSchema(name="dislikes", dtype=DataType.INT32)

    collection_schema = CollectionSchema(
        fields=[
            id_field, url, title, v_title, content, v_content,
            content_type, author, avatar_url, likes, dislikes,
        ],
        auto_id=False,
        # NOTE: the option is ``enable_dynamic_field`` (singular). The plural
        # spelling is not a recognised keyword, so the setting was not applied.
        enable_dynamic_field=True,
        description="Sex Education QA",
    )

    # Both vector fields share the same index configuration.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "content_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )

    status = client.get_load_state(collection_name)
    print(f"t_sur_sex_ed_question_answer_spider:{status}")
|
|
|
|
|
def prepare_sex_ed_youtube():
    """Recreate the ``t_sur_sex_ed_youtube_spider`` collection.

    Connects to ``URI``, drops the collection by name, declares the schema
    (VARCHAR primary key ``id``, VARCHAR metadata fields, an INT8
    ``delete_status`` and two 1024-dim float-vector fields), registers an
    IVF_FLAT / COSINE index on ``title_vector`` and ``tag_vector``, creates
    the collection and prints the load state reported by the client.
    """
    collection_name = "t_sur_sex_ed_youtube_spider"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # Named ``id_field`` rather than ``id`` so the builtin is not shadowed.
    id_field = FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000)
    link = FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    # views / likes / duration are declared as strings in this schema.
    views = FieldSchema(name="views", dtype=DataType.VARCHAR, max_length=64)
    author = FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64)
    picture = FieldSchema(name="picture", dtype=DataType.VARCHAR, max_length=512)
    likes = FieldSchema(name="likes", dtype=DataType.VARCHAR, max_length=64)
    duration = FieldSchema(name="duration", dtype=DataType.VARCHAR, max_length=64)
    tag = FieldSchema(name="tag", dtype=DataType.VARCHAR, max_length=64)
    v_tag = FieldSchema(name="tag_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    delete_status = FieldSchema(name="delete_status", dtype=DataType.INT8)

    collection_schema = CollectionSchema(
        fields=[
            id_field, link, title, v_title, views, author, picture,
            likes, duration, tag, v_tag, delete_status,
        ],
        auto_id=False,
        # NOTE: the option is ``enable_dynamic_field`` (singular). The plural
        # spelling is not a recognised keyword, so the setting was not applied.
        enable_dynamic_field=True,
        description="Sex Education videos collection",
    )

    # Both vector fields share the same index configuration.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "tag_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )

    status = client.get_load_state(collection_name)
    print(f"t_sur_sex_ed_youtube_spider:{status}")
|
|
|
|
|
def prepare_pornVideo():
    """Recreate the ``t_sur_video`` collection.

    Connects to ``URI``, drops the collection by name, declares the schema
    (VARCHAR primary key ``url``, INT64 ``duration`` / ``viewCount``, VARCHAR
    metadata, INT8 ``resourceType`` / ``sexualPreference`` and two 1024-dim
    float-vector fields), registers an IVF_FLAT / COSINE index on
    ``title_vector`` and ``categories_vector``, creates the collection and
    prints the load state reported by the client.
    """
    collection_name = "t_sur_video"

    client = MilvusClient(uri=URI)
    client.drop_collection(collection_name)

    # The URL itself is the primary key for this collection.
    url = FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=256, is_primary=True)
    duration = FieldSchema(name="duration", dtype=DataType.INT64)
    view_count = FieldSchema(name="viewCount", dtype=DataType.INT64)
    cover_picture = FieldSchema(name="coverPicture", dtype=DataType.VARCHAR, max_length=1024)
    title = FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512)
    v_title = FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    uploader = FieldSchema(name="uploader", dtype=DataType.VARCHAR, max_length=256)
    categories = FieldSchema(name="categories", dtype=DataType.VARCHAR, max_length=1024)
    v_categories = FieldSchema(name="categories_vector", dtype=DataType.FLOAT_VECTOR, dim=1024)
    resource_type = FieldSchema(name="resourceType", dtype=DataType.INT8)
    sexual_preference = FieldSchema(name="sexualPreference", dtype=DataType.INT8)

    collection_schema = CollectionSchema(
        fields=[
            url, duration, view_count, cover_picture, title, v_title,
            uploader, categories, v_categories, resource_type, sexual_preference,
        ],
        auto_id=False,
        # NOTE: the option is ``enable_dynamic_field`` (singular). The plural
        # spelling is not a recognised keyword, so the setting was not applied.
        enable_dynamic_field=True,
        description="Sexual Education Videos",
    )

    # Both vector fields share the same index configuration.
    index_params = client.prepare_index_params()
    for vector_field in ("title_vector", "categories_vector"):
        index_params.add_index(
            field_name=vector_field,
            index_type="IVF_FLAT",
            metric_type="COSINE",
            params={"nlist": 128},
        )

    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )

    status = client.get_load_state(collection_name)
    print(f"t_sur_video:{status}")
|
|
|
if __name__ == '__main__':
    # Script entry point: rebuild each collection, one after another,
    # in the order listed here.
    for prepare in (
        prepare_sex_ed_article_milvus,
        prepare_sex_ed_qa_milvus,
        prepare_sex_ed_youtube,
        prepare_pornVideo,
    ):
        prepare()
|
|