feat: add LLMX configuration with Crawl4AI RAG MCP server
- Add config.toml with MCP servers configuration - Add compose.yaml for PostgreSQL+pgvector, PostgREST, and Crawl4AI RAG - Include forked mcp-crawl4ai-rag with BGE 1024-dim embedding support - Custom schema (crawled_pages_1024.sql) for BGE embeddings 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
226
servers/mcp-crawl4ai-rag/crawled_pages_1024.sql
Normal file
226
servers/mcp-crawl4ai-rag/crawled_pages_1024.sql
Normal file
@@ -0,0 +1,226 @@
|
||||
-- Schema for Crawl4AI RAG with BGE embeddings (1024 dimensions)
|
||||
-- Modified from original 1536-dim OpenAI schema
|
||||
|
||||
-- Enable the pgvector extension
|
||||
create extension if not exists vector;
|
||||
|
||||
-- Create roles for PostgREST (Supabase-compatible setup)
|
||||
DO $$
|
||||
BEGIN
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'anon') THEN
|
||||
CREATE ROLE anon NOLOGIN;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'authenticated') THEN
|
||||
CREATE ROLE authenticated NOLOGIN;
|
||||
END IF;
|
||||
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN
|
||||
CREATE ROLE service_role NOLOGIN BYPASSRLS;
|
||||
END IF;
|
||||
END
|
||||
$$;
|
||||
|
||||
-- Grant schema usage to roles
|
||||
GRANT USAGE ON SCHEMA public TO anon, authenticated, service_role;
|
||||
|
||||
-- Drop tables if they exist (to allow rerunning the script)
|
||||
drop table if exists crawled_pages cascade;
|
||||
drop table if exists code_examples cascade;
|
||||
drop table if exists sources cascade;
|
||||
|
||||
-- Create the sources table
|
||||
create table sources (
|
||||
source_id text primary key,
|
||||
summary text,
|
||||
total_word_count integer default 0,
|
||||
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
|
||||
updated_at timestamp with time zone default timezone('utc'::text, now()) not null
|
||||
);
|
||||
|
||||
-- Create the documentation chunks table
|
||||
create table crawled_pages (
|
||||
id bigserial primary key,
|
||||
url varchar not null,
|
||||
chunk_number integer not null,
|
||||
content text not null,
|
||||
metadata jsonb not null default '{}'::jsonb,
|
||||
source_id text not null,
|
||||
embedding vector(1024), -- BGE embeddings are 1024 dimensions
|
||||
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
|
||||
|
||||
-- Add a unique constraint to prevent duplicate chunks for the same URL
|
||||
unique(url, chunk_number),
|
||||
|
||||
-- Add foreign key constraint to sources table
|
||||
foreign key (source_id) references sources(source_id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
-- Create an index for better vector similarity search performance
|
||||
create index on crawled_pages using ivfflat (embedding vector_cosine_ops);
|
||||
|
||||
-- Create an index on metadata for faster filtering
|
||||
create index idx_crawled_pages_metadata on crawled_pages using gin (metadata);
|
||||
|
||||
-- Create an index on source_id for faster filtering
|
||||
CREATE INDEX idx_crawled_pages_source_id ON crawled_pages (source_id);
|
||||
|
||||
-- Create a function to search for documentation chunks
|
||||
create or replace function match_crawled_pages (
|
||||
query_embedding vector(1024),
|
||||
match_count int default 10,
|
||||
filter jsonb DEFAULT '{}'::jsonb,
|
||||
source_filter text DEFAULT NULL
|
||||
) returns table (
|
||||
id bigint,
|
||||
url varchar,
|
||||
chunk_number integer,
|
||||
content text,
|
||||
metadata jsonb,
|
||||
source_id text,
|
||||
similarity float
|
||||
)
|
||||
language plpgsql
|
||||
as $$
|
||||
#variable_conflict use_column
|
||||
begin
|
||||
return query
|
||||
select
|
||||
id,
|
||||
url,
|
||||
chunk_number,
|
||||
content,
|
||||
metadata,
|
||||
source_id,
|
||||
1 - (crawled_pages.embedding <=> query_embedding) as similarity
|
||||
from crawled_pages
|
||||
where metadata @> filter
|
||||
AND (source_filter IS NULL OR source_id = source_filter)
|
||||
order by crawled_pages.embedding <=> query_embedding
|
||||
limit match_count;
|
||||
end;
|
||||
$$;
|
||||
|
||||
-- Enable RLS on the crawled_pages table
|
||||
alter table crawled_pages enable row level security;
|
||||
|
||||
-- Create policies for crawled_pages
|
||||
create policy "Allow public read access to crawled_pages"
|
||||
on crawled_pages
|
||||
for select
|
||||
to anon, authenticated
|
||||
using (true);
|
||||
|
||||
create policy "Allow service_role full access to crawled_pages"
|
||||
on crawled_pages
|
||||
for all
|
||||
to service_role
|
||||
using (true)
|
||||
with check (true);
|
||||
|
||||
-- Enable RLS on the sources table
|
||||
alter table sources enable row level security;
|
||||
|
||||
-- Create policies for sources
|
||||
create policy "Allow public read access to sources"
|
||||
on sources
|
||||
for select
|
||||
to anon, authenticated
|
||||
using (true);
|
||||
|
||||
create policy "Allow service_role full access to sources"
|
||||
on sources
|
||||
for all
|
||||
to service_role
|
||||
using (true)
|
||||
with check (true);
|
||||
|
||||
-- Create the code_examples table
|
||||
create table code_examples (
|
||||
id bigserial primary key,
|
||||
url varchar not null,
|
||||
chunk_number integer not null,
|
||||
content text not null, -- The code example content
|
||||
summary text not null, -- Summary of the code example
|
||||
metadata jsonb not null default '{}'::jsonb,
|
||||
source_id text not null,
|
||||
embedding vector(1024), -- BGE embeddings are 1024 dimensions
|
||||
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
|
||||
|
||||
-- Add a unique constraint to prevent duplicate chunks for the same URL
|
||||
unique(url, chunk_number),
|
||||
|
||||
-- Add foreign key constraint to sources table
|
||||
foreign key (source_id) references sources(source_id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
-- Create an index for better vector similarity search performance
|
||||
create index on code_examples using ivfflat (embedding vector_cosine_ops);
|
||||
|
||||
-- Create an index on metadata for faster filtering
|
||||
create index idx_code_examples_metadata on code_examples using gin (metadata);
|
||||
|
||||
-- Create an index on source_id for faster filtering
|
||||
CREATE INDEX idx_code_examples_source_id ON code_examples (source_id);
|
||||
|
||||
-- Create a function to search for code examples
|
||||
create or replace function match_code_examples (
|
||||
query_embedding vector(1024),
|
||||
match_count int default 10,
|
||||
filter jsonb DEFAULT '{}'::jsonb,
|
||||
source_filter text DEFAULT NULL
|
||||
) returns table (
|
||||
id bigint,
|
||||
url varchar,
|
||||
chunk_number integer,
|
||||
content text,
|
||||
summary text,
|
||||
metadata jsonb,
|
||||
source_id text,
|
||||
similarity float
|
||||
)
|
||||
language plpgsql
|
||||
as $$
|
||||
#variable_conflict use_column
|
||||
begin
|
||||
return query
|
||||
select
|
||||
id,
|
||||
url,
|
||||
chunk_number,
|
||||
content,
|
||||
summary,
|
||||
metadata,
|
||||
source_id,
|
||||
1 - (code_examples.embedding <=> query_embedding) as similarity
|
||||
from code_examples
|
||||
where metadata @> filter
|
||||
AND (source_filter IS NULL OR source_id = source_filter)
|
||||
order by code_examples.embedding <=> query_embedding
|
||||
limit match_count;
|
||||
end;
|
||||
$$;
|
||||
|
||||
-- Enable RLS on the code_examples table
|
||||
alter table code_examples enable row level security;
|
||||
|
||||
-- Create policies for code_examples
|
||||
create policy "Allow public read access to code_examples"
|
||||
on code_examples
|
||||
for select
|
||||
to anon, authenticated
|
||||
using (true);
|
||||
|
||||
create policy "Allow service_role full access to code_examples"
|
||||
on code_examples
|
||||
for all
|
||||
to service_role
|
||||
using (true)
|
||||
with check (true);
|
||||
|
||||
-- Grant table access to roles
|
||||
GRANT SELECT ON sources, crawled_pages, code_examples TO anon, authenticated;
|
||||
GRANT ALL ON sources, crawled_pages, code_examples TO service_role;
|
||||
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO anon, authenticated, service_role;
|
||||
|
||||
-- Grant function execute permissions
|
||||
GRANT EXECUTE ON FUNCTION match_crawled_pages TO anon, authenticated, service_role;
|
||||
GRANT EXECUTE ON FUNCTION match_code_examples TO anon, authenticated, service_role;
|
||||
Reference in New Issue
Block a user