-- l_ai_knowledge/migrations/paradedb/01-migrate-to-paradedb.sql

-- Migration script: migrate from PostgreSQL to ParadeDB
-- Note: make sure the data has been backed up before running this script
-- 1. Export the data (run against PostgreSQL):
-- pg_dump -U postgres -h localhost -p 5432 -d your_database > backup.sql
-- 2. Import the data (run against ParadeDB):
-- psql -U postgres -h localhost -p 5432 -d your_database < backup.sql
-- 3. Verify the data
-- Insert some sample data
-- INSERT INTO tenants (id, name, description, status, api_key)
-- VALUES
-- (1, 'Demo Tenant', 'This is a demo tenant for testing', 'active', 'sk-00000001abcdefg123456')
-- ON CONFLICT DO NOTHING;
-- SELECT setval('tenants_id_seq', (SELECT MAX(id) FROM tenants));
-- -- Create knowledge base
-- INSERT INTO knowledge_bases (id, name, description, tenant_id, chunking_config, image_processing_config, embedding_model_id)
-- VALUES
-- ('kb-00000001', 'Default Knowledge Base', 'Default knowledge base for testing', 1, '{"chunk_size": 512, "chunk_overlap": 50, "separators": ["\n\n", "\n", "。"], "keep_separator": true}', '{"enable_multimodal": false, "model_id": ""}', 'model-embedding-00000001'),
-- ('kb-00000002', 'Test Knowledge Base', 'Test knowledge base for development', 1, '{"chunk_size": 512, "chunk_overlap": 50, "separators": ["\n\n", "\n", "。"], "keep_separator": true}', '{"enable_multimodal": false, "model_id": ""}', 'model-embedding-00000001'),
-- ('kb-00000003', 'Test Knowledge Base 2', 'Test knowledge base for development 2', 1, '{"chunk_size": 512, "chunk_overlap": 50, "separators": ["\n\n", "\n", "。"], "keep_separator": true}', '{"enable_multimodal": false, "model_id": ""}', 'model-embedding-00000001')
-- ON CONFLICT DO NOTHING;
SELECT COUNT(*) FROM tenants;
SELECT COUNT(*) FROM models;
SELECT COUNT(*) FROM knowledge_bases;
SELECT COUNT(*) FROM knowledges;
-- Test Chinese full-text search
-- Create the documents table
CREATE TABLE chinese_documents (
    id SERIAL PRIMARY KEY,
    title TEXT,
    content TEXT,
    published_date DATE
);
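-- The BM25 index below relies on ParadeDB's pg_search extension. Official ParadeDB
-- images usually ship with it preinstalled; if it is not enabled in the target
-- database, a reasonable precaution is to enable it first (a minimal sketch,
-- assuming the extension is named pg_search):
CREATE EXTENSION IF NOT EXISTS pg_search;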
-- Create a BM25 index on the table, using the Lindera tokenizer (chinese_lindera) for Chinese text
CREATE INDEX idx_documents_bm25 ON chinese_documents
USING bm25 (id, content)
WITH (
    key_field = 'id',
    text_fields = '{
        "content": {
            "tokenizer": {"type": "chinese_lindera"}
        }
    }'
);
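-- Optional sanity check (a sketch, using the standard pg_indexes catalog view):
-- confirm the BM25 index was actually created on the table.
SELECT indexname, indexdef FROM pg_indexes WHERE tablename = 'chinese_documents';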
INSERT INTO chinese_documents (title, content, published_date)
VALUES
('人工智能的发展', '人工智能技术正在快速发展,影响了我们生活的方方面面。大语言模型是最近的一个重要突破。', '2023-01-15'),
('机器学习基础', '机器学习是人工智能的一个重要分支,包括监督学习、无监督学习和强化学习等方法。', '2023-02-20'),
('深度学习应用', '深度学习在图像识别、自然语言处理和语音识别等领域有广泛应用。', '2023-03-10'),
('自然语言处理技术', '自然语言处理允许计算机理解、解释和生成人类语言,是人工智能的核心技术之一。', '2023-04-05'),
('计算机视觉入门', '计算机视觉让机器能够"看到"并理解视觉世界,广泛应用于安防、医疗等领域。', '2023-05-12');
INSERT INTO chinese_documents (title, content, published_date)
VALUES
('hello world', 'hello world', '2023-05-12');
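-- The script above builds the index and loads test rows but never runs a search.
-- A minimal query sketch, assuming ParadeDB's @@@ full-text operator and the
-- paradedb.score() function (exact syntax may vary between pg_search versions):
SELECT id, title, paradedb.score(id) AS score
FROM chinese_documents
WHERE content @@@ '人工智能'
ORDER BY score DESC
LIMIT 5;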