feat: add LLMX configuration with Crawl4AI RAG MCP server

- Add config.toml with MCP servers configuration
- Add compose.yaml for PostgreSQL+pgvector, PostgREST, and Crawl4AI RAG
- Include forked mcp-crawl4ai-rag with BGE 1024-dim embedding support
- Custom schema (crawled_pages_1024.sql) for BGE embeddings

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-11-25 08:29:43 +01:00
commit 10bcbb2120
23 changed files with 10224 additions and 0 deletions

View File

@@ -0,0 +1,226 @@
-- Schema for Crawl4AI RAG with BGE embeddings (1024 dimensions)
-- Modified from original 1536-dim OpenAI schema
-- Enable the pgvector extension
create extension if not exists vector;
-- Create roles for PostgREST (Supabase-compatible setup)
DO $$
BEGIN
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'anon') THEN
CREATE ROLE anon NOLOGIN;
END IF;
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'authenticated') THEN
CREATE ROLE authenticated NOLOGIN;
END IF;
IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'service_role') THEN
CREATE ROLE service_role NOLOGIN BYPASSRLS;
END IF;
END
$$;
-- Grant schema usage to roles
GRANT USAGE ON SCHEMA public TO anon, authenticated, service_role;
-- Drop tables if they exist (to allow rerunning the script)
drop table if exists crawled_pages cascade;
drop table if exists code_examples cascade;
drop table if exists sources cascade;
-- Create the sources table
create table sources (
source_id text primary key,
summary text,
total_word_count integer default 0,
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
updated_at timestamp with time zone default timezone('utc'::text, now()) not null
);
-- Create the documentation chunks table
create table crawled_pages (
id bigserial primary key,
url varchar not null,
chunk_number integer not null,
content text not null,
metadata jsonb not null default '{}'::jsonb,
source_id text not null,
embedding vector(1024), -- BGE embeddings are 1024 dimensions
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
-- Add a unique constraint to prevent duplicate chunks for the same URL
unique(url, chunk_number),
-- Add foreign key constraint to sources table
foreign key (source_id) references sources(source_id) ON DELETE CASCADE
);
-- Create an index for better vector similarity search performance
create index on crawled_pages using ivfflat (embedding vector_cosine_ops);
-- Create an index on metadata for faster filtering
create index idx_crawled_pages_metadata on crawled_pages using gin (metadata);
-- Create an index on source_id for faster filtering
CREATE INDEX idx_crawled_pages_source_id ON crawled_pages (source_id);
-- Create a function to search for documentation chunks
create or replace function match_crawled_pages (
query_embedding vector(1024),
match_count int default 10,
filter jsonb DEFAULT '{}'::jsonb,
source_filter text DEFAULT NULL
) returns table (
id bigint,
url varchar,
chunk_number integer,
content text,
metadata jsonb,
source_id text,
similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
return query
select
id,
url,
chunk_number,
content,
metadata,
source_id,
1 - (crawled_pages.embedding <=> query_embedding) as similarity
from crawled_pages
where metadata @> filter
AND (source_filter IS NULL OR source_id = source_filter)
order by crawled_pages.embedding <=> query_embedding
limit match_count;
end;
$$;
-- Enable RLS on the crawled_pages table
alter table crawled_pages enable row level security;
-- Create policies for crawled_pages
create policy "Allow public read access to crawled_pages"
on crawled_pages
for select
to anon, authenticated
using (true);
create policy "Allow service_role full access to crawled_pages"
on crawled_pages
for all
to service_role
using (true)
with check (true);
-- Enable RLS on the sources table
alter table sources enable row level security;
-- Create policies for sources
create policy "Allow public read access to sources"
on sources
for select
to anon, authenticated
using (true);
create policy "Allow service_role full access to sources"
on sources
for all
to service_role
using (true)
with check (true);
-- Create the code_examples table
create table code_examples (
id bigserial primary key,
url varchar not null,
chunk_number integer not null,
content text not null, -- The code example content
summary text not null, -- Summary of the code example
metadata jsonb not null default '{}'::jsonb,
source_id text not null,
embedding vector(1024), -- BGE embeddings are 1024 dimensions
created_at timestamp with time zone default timezone('utc'::text, now()) not null,
-- Add a unique constraint to prevent duplicate chunks for the same URL
unique(url, chunk_number),
-- Add foreign key constraint to sources table
foreign key (source_id) references sources(source_id) ON DELETE CASCADE
);
-- Create an index for better vector similarity search performance
create index on code_examples using ivfflat (embedding vector_cosine_ops);
-- Create an index on metadata for faster filtering
create index idx_code_examples_metadata on code_examples using gin (metadata);
-- Create an index on source_id for faster filtering
CREATE INDEX idx_code_examples_source_id ON code_examples (source_id);
-- Create a function to search for code examples
create or replace function match_code_examples (
query_embedding vector(1024),
match_count int default 10,
filter jsonb DEFAULT '{}'::jsonb,
source_filter text DEFAULT NULL
) returns table (
id bigint,
url varchar,
chunk_number integer,
content text,
summary text,
metadata jsonb,
source_id text,
similarity float
)
language plpgsql
as $$
#variable_conflict use_column
begin
return query
select
id,
url,
chunk_number,
content,
summary,
metadata,
source_id,
1 - (code_examples.embedding <=> query_embedding) as similarity
from code_examples
where metadata @> filter
AND (source_filter IS NULL OR source_id = source_filter)
order by code_examples.embedding <=> query_embedding
limit match_count;
end;
$$;
-- Enable RLS on the code_examples table
alter table code_examples enable row level security;
-- Create policies for code_examples
create policy "Allow public read access to code_examples"
on code_examples
for select
to anon, authenticated
using (true);
create policy "Allow service_role full access to code_examples"
on code_examples
for all
to service_role
using (true)
with check (true);
-- Grant table access to roles
GRANT SELECT ON sources, crawled_pages, code_examples TO anon, authenticated;
GRANT ALL ON sources, crawled_pages, code_examples TO service_role;
GRANT USAGE, SELECT ON ALL SEQUENCES IN SCHEMA public TO anon, authenticated, service_role;
-- Grant function execute permissions
GRANT EXECUTE ON FUNCTION match_crawled_pages TO anon, authenticated, service_role;
GRANT EXECUTE ON FUNCTION match_code_examples TO anon, authenticated, service_role;