feat: add LLMX configuration with Crawl4AI RAG MCP server

- Add config.toml with MCP servers configuration - Add compose.yaml for PostgreSQL+pgvector, PostgREST, and Crawl4AI RAG - Include forked mcp-crawl4ai-rag with BGE 1024-dim embedding support - Custom schema (crawled_pages_1024.sql) for BGE embeddings 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-25 08:29:43 +01:00
commit 10bcbb2120
23 changed files with 10224 additions and 0 deletions
--- a/servers/mcp-crawl4ai-rag/crawled_pages_1024.sql
+++ b/servers/mcp-crawl4ai-rag/crawled_pages_1024.sql
@@ -0,0 +1,226 @@
+-- Schema for Crawl4AI RAG with BGE embeddings (1024 dimensions)
+-- Modified from original 1536-dim OpenAI schema
+
+-- Enable the pgvector extension
+create extension if not exists vector;
+
+-- Create roles for PostgREST (Supabase-compatible setup)
+DO $$
+BEGIN
+    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'anon') THEN
+        CREATE ROLE anon NOLOGIN;
+    END IF;
+    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'authenticated') THEN
+        CREATE ROLE authenticated NOLOGIN;
+    END IF;
+    IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN
+        CREATE ROLE service_role NOLOGIN BYPASSRLS;
+    END IF;
+END
+$$;
+
+-- Grant schema usage to roles
+GRANT USAGE ON SCHEMA public TO anon, authenticated, service_role;
+
+-- Drop tables if they exist (to allow rerunning the script)
+drop table if exists crawled_pages cascade;
+drop table if exists code_examples cascade;
+drop table if exists sources cascade;
+
+-- Create the sources table
+create table sources (
+    source_id text primary key,
+    summary text,
+    total_word_count integer default 0,
+    created_at timestamp with time zone default timezone('utc'::text, now()) not null,
+    updated_at timestamp with time zone default timezone('utc'::text, now()) not null
+);
+
+-- Create the documentation chunks table
+create table crawled_pages (
+    id bigserial primary key,
+    url varchar not null,
+    chunk_number integer not null,
+    content text not null,
+    metadata jsonb not null default '{}'::jsonb,
+    source_id text not null,
+    embedding vector(1024),  -- BGE embeddings are 1024 dimensions
+    created_at timestamp with time zone default timezone('utc'::text, now()) not null,
+
+    -- Add a unique constraint to prevent duplicate chunks for the same URL
+    unique(url, chunk_number),
+
+    -- Add foreign key constraint to sources table
+    foreign key (source_id) references sources(source_id) ON DELETE CASCADE
+);
+
+-- Create an index for better vector similarity search performance
+create index on crawled_pages using ivfflat (embedding vector_cosine_ops);
+
+-- Create an index on metadata for faster filtering
+create index idx_crawled_pages_metadata on crawled_pages using gin (metadata);
+
+-- Create an index on source_id for faster filtering
+CREATE INDEX idx_crawled_pages_source_id ON crawled_pages (source_id);
+
+-- Create a function to search for documentation chunks
+create or replace function match_crawled_pages (
+  query_embedding vector(1024),
+  match_count int default 10,
+  filter jsonb DEFAULT '{}'::jsonb,
+  source_filter text DEFAULT NULL
+) returns table (
+  id bigint,
+  url varchar,
+  chunk_number integer,
+  content text,
+  metadata jsonb,
+  source_id text,
+  similarity float
+)
+language plpgsql
+as $$
+#variable_conflict use_column
+begin
+  return query
+  select
+    id,
+    url,
+    chunk_number,
+    content,
+    metadata,
+    source_id,
+    1 - (crawled_pages.embedding <=> query_embedding) as similarity
+  from crawled_pages
+  where metadata @> filter
+    AND (source_filter IS NULL OR source_id = source_filter)
+  order by crawled_pages.embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
+
+-- Enable RLS on the crawled_pages table
+alter table crawled_pages enable row level security;
+
+-- Create policies for crawled_pages
+create policy "Allow public read access to crawled_pages"
+  on crawled_pages
+  for select
+  to anon, authenticated
+  using (true);
+
+create policy "Allow service_role full access to crawled_pages"
+  on crawled_pages
+  for all
+  to service_role
+  using (true)
+  with check (true);
+
+-- Enable RLS on the sources table
+alter table sources enable row level security;
+
+-- Create policies for sources
+create policy "Allow public read access to sources"
+  on sources
+  for select
+  to anon, authenticated
+  using (true);
+
+create policy "Allow service_role full access to sources"
+  on sources
+  for all
+  to service_role
+  using (true)
+  with check (true);
+
+-- Create the code_examples table
+create table code_examples (
+    id bigserial primary key,
+    url varchar not null,
+    chunk_number integer not null,
+    content text not null,  -- The code example content
+    summary text not null,  -- Summary of the code example
+    metadata jsonb not null default '{}'::jsonb,
+    source_id text not null,
+    embedding vector(1024),  -- BGE embeddings are 1024 dimensions
+    created_at timestamp with time zone default timezone('utc'::text, now()) not null,
+
+    -- Add a unique constraint to prevent duplicate chunks for the same URL
+    unique(url, chunk_number),
+
+    -- Add foreign key constraint to sources table
+    foreign key (source_id) references sources(source_id) ON DELETE CASCADE
+);
+
+-- Create an index for better vector similarity search performance
+create index on code_examples using ivfflat (embedding vector_cosine_ops);
+
+-- Create an index on metadata for faster filtering
+create index idx_code_examples_metadata on code_examples using gin (metadata);
+
+-- Create an index on source_id for faster filtering
+CREATE INDEX idx_code_examples_source_id ON code_examples (source_id);
+
+-- Create a function to search for code examples
+create or replace function match_code_examples (
+  query_embedding vector(1024),
+  match_count int default 10,
+  filter jsonb DEFAULT '{}'::jsonb,
+  source_filter text DEFAULT NULL
+) returns table (
+  id bigint,
+  url varchar,
+  chunk_number integer,
+  content text,
+  summary text,
+  metadata jsonb,
+  source_id text,
+  similarity float
+)
+language plpgsql
+as $$
+#variable_conflict use_column
+begin
+  return query
+  select
+    id,
+    url,
+    chunk_number,
+    content,
+    summary,
+    metadata,
+    source_id,
+    1 - (code_examples.embedding <=> query_embedding) as similarity
+  from code_examples
+  where metadata @> filter
+    AND (source_filter IS NULL OR source_id = source_filter)
+  order by code_examples.embedding <=> query_embedding
+  limit match_count;
+end;
+$$;
+
+-- Enable RLS on the code_examples table
+alter table code_examples enable row level security;
+
+-- Create policies for code_examples
+create policy "Allow public read access to code_examples"
+  on code_examples
+  for select
+  to anon, authenticated
+  using (true);
+
+create policy "Allow service_role full access to code_examples"
+  on code_examples
+  for all
+  to service_role
+  using (true)
+  with check (true);
+
+-- Grant table access to roles
+GRANT SELECT ON sources, crawled_pages, code_examples TO anon, authenticated;
+GRANT ALL ON sources, crawled_pages, code_examples TO service_role;
+GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO anon, authenticated, service_role;
+
+-- Grant function execute permissions
+GRANT EXECUTE ON FUNCTION match_crawled_pages TO anon, authenticated, service_role;
+GRANT EXECUTE ON FUNCTION match_code_examples TO anon, authenticated, service_role;