"""Create (or re-create) the Milvus collections used by the spider pipelines.

Running this module as a script drops and rebuilds every collection defined
below, so any data already stored in them is discarded.
"""
from typing import Sequence

from pymilvus import MilvusClient, DataType, FieldSchema, CollectionSchema, Collection

URI = "http://localhost:19530"

# Every vector field in this module uses the same dimension and index settings.
_VECTOR_DIM = 1024
_INDEX_TYPE = "IVF_FLAT"
_METRIC_TYPE = "COSINE"
_INDEX_PARAMS = {"nlist": 128}


def _recreate_collection(
    collection_name: str,
    fields: Sequence[FieldSchema],
    description: str,
    vector_field_names: Sequence[str],
) -> None:
    """Drop ``collection_name`` and create it again with the given schema.

    Args:
        collection_name: Name of the collection to drop and re-create.
        fields: Field definitions, in schema order.
        description: Free-text description stored with the schema.
        vector_field_names: Names of the fields that get an index; each one
            is indexed with the module-wide index type, metric and params.
    """
    client = MilvusClient(uri=URI)
    # Destructive on purpose: the collection is rebuilt from scratch.
    client.drop_collection(collection_name)

    # NOTE: the keyword is ``enable_dynamic_field`` (singular). Three of the
    # four original schemas spelled it ``enable_dynamic_fields``, which is not
    # the recognised option, so those collections were created without
    # dynamic-field support.
    collection_schema = CollectionSchema(
        fields=list(fields),
        auto_id=False,
        enable_dynamic_field=True,
        description=description,
    )

    index_params = client.prepare_index_params()
    for field_name in vector_field_names:
        index_params.add_index(
            field_name=field_name,
            index_type=_INDEX_TYPE,
            metric_type=_METRIC_TYPE,
            # Copy so each index gets its own params dict.
            params=dict(_INDEX_PARAMS),
        )

    client.create_collection(
        collection_name=collection_name,
        schema=collection_schema,
        index_params=index_params,
    )

    status = client.get_load_state(collection_name)
    print(f"{collection_name}:{status}")


def prepare_sex_ed_article_milvus():
    """Re-create the ``t_sur_sex_ed_article_spider`` collection."""
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="chunk", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="chunk_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        # "tags" is itself a vector field in this collection (no *_vector twin).
        FieldSchema(name="tags", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="category", dtype=DataType.VARCHAR, max_length=128),
    ]
    _recreate_collection(
        collection_name="t_sur_sex_ed_article_spider",
        fields=fields,
        description="Schema of collection: t_sur_sex_ed_article_spider",
        vector_field_names=["title_vector", "chunk_vector", "tags"],
    )


def prepare_sex_ed_qa_milvus():
    """Re-create the ``t_sur_sex_ed_question_answer_spider`` collection."""
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000),
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=1000),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=2000),
        FieldSchema(name="content_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="content_type", dtype=DataType.VARCHAR, max_length=8),
        FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="avatar_url", dtype=DataType.VARCHAR, max_length=1024),
        FieldSchema(name="likes", dtype=DataType.INT32),
        FieldSchema(name="dislikes", dtype=DataType.INT32),
    ]
    _recreate_collection(
        collection_name="t_sur_sex_ed_question_answer_spider",
        fields=fields,
        description="Sex Education QA",
        vector_field_names=["title_vector", "content_vector"],
    )


def prepare_sex_ed_youtube():
    """Re-create the ``t_sur_sex_ed_youtube_spider`` collection."""
    fields = [
        FieldSchema(name="id", dtype=DataType.VARCHAR, is_primary=True, max_length=1000),
        FieldSchema(name="link", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="views", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="author", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="picture", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="likes", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="duration", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="tag", dtype=DataType.VARCHAR, max_length=64),
        FieldSchema(name="tag_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="delete_status", dtype=DataType.INT8),
    ]
    _recreate_collection(
        collection_name="t_sur_sex_ed_youtube_spider",
        fields=fields,
        description="Sex Education videos collection",
        vector_field_names=["title_vector", "tag_vector"],
    )


def prepare_pornVideo():
    """Re-create the ``t_sur_video`` collection (primary key is ``url``)."""
    fields = [
        FieldSchema(name="url", dtype=DataType.VARCHAR, max_length=256, is_primary=True),
        FieldSchema(name="duration", dtype=DataType.INT64),
        FieldSchema(name="viewCount", dtype=DataType.INT64),
        FieldSchema(name="coverPicture", dtype=DataType.VARCHAR, max_length=1024),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=512),
        FieldSchema(name="title_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="uploader", dtype=DataType.VARCHAR, max_length=256),
        FieldSchema(name="categories", dtype=DataType.VARCHAR, max_length=1024),
        FieldSchema(name="categories_vector", dtype=DataType.FLOAT_VECTOR, dim=_VECTOR_DIM),
        FieldSchema(name="resourceType", dtype=DataType.INT8),
        FieldSchema(name="sexualPreference", dtype=DataType.INT8),
    ]
    _recreate_collection(
        collection_name="t_sur_video",
        fields=fields,
        description="Sexual Education Videos",
        vector_field_names=["title_vector", "categories_vector"],
    )


if __name__ == '__main__':
    prepare_sex_ed_article_milvus()
    prepare_sex_ed_qa_milvus()
    prepare_sex_ed_youtube()
    prepare_pornVideo()