From 15854cc91eafcd73925e35e239b3e6e925207b16 Mon Sep 17 00:00:00 2001 From: "K. Hodges" Date: Sun, 24 May 2026 03:09:13 -0700 Subject: [PATCH] Add heading chunks and SQLite FTS search --- docs/dev/tasks.md | 10 +-- src/chunk.rs | 210 ++++++++++++++++++++++++++++++++++++++++++++++ src/db.rs | 206 +++++++++++++++++++++++++++++++++++++-------- src/main.rs | 43 +++++++--- src/vault.rs | 15 +++- 5 files changed, 431 insertions(+), 53 deletions(-) create mode 100644 src/chunk.rs diff --git a/docs/dev/tasks.md b/docs/dev/tasks.md index b744ccb..b42be62 100644 --- a/docs/dev/tasks.md +++ b/docs/dev/tasks.md @@ -327,7 +327,7 @@ Detect changed notes efficiently. # Phase 4 — Chunking -## [ ] GM-016 — Implement heading-based chunking +## [x] GM-016 — Implement heading-based chunking ### Goals Split notes into useful retrieval units. @@ -344,7 +344,7 @@ Split notes into useful retrieval units. --- -## [ ] GM-017 — Add fallback chunk splitting +## [x] GM-017 — Add fallback chunk splitting ### Goals Handle giant sections safely. @@ -360,7 +360,7 @@ Handle giant sections safely. --- -## [ ] GM-018 — Estimate token counts +## [x] GM-018 — Estimate token counts ### Goals Prepare for LLM context budgeting. @@ -378,7 +378,7 @@ Prepare for LLM context budgeting. # Phase 5 — Search -## [ ] GM-019 — Implement SQLite FTS search +## [x] GM-019 — Implement SQLite FTS search ### Goals Add keyword search. @@ -396,7 +396,7 @@ Add keyword search. --- -## [ ] GM-020 — Implement basic CLI search command +## [x] GM-020 — Implement basic CLI search command ### Goals Expose usable search interface. diff --git a/src/chunk.rs b/src/chunk.rs new file mode 100644 index 0000000..a87ef8b --- /dev/null +++ b/src/chunk.rs @@ -0,0 +1,210 @@ +use serde::Serialize; + +use crate::db::sha256_hex; +use crate::markdown::{MarkdownBlock, MarkdownBlockKind}; + +#[derive(Clone, Debug, Serialize)] +pub struct NoteChunk { + pub index: usize, + pub heading_path: Vec, + pub content: String, + pub chunk_type: ChunkType, + pub start_line: usize, + pub end_line: usize, + pub token_estimate: usize, + pub content_hash: String, +} + +#[derive(Clone, Debug, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum ChunkType { + HeadingSection, + SplitSection, +} + +pub fn build_chunks( + blocks: &[MarkdownBlock], + target_tokens: usize, + overlap_tokens: usize, +) -> Vec { + let mut chunks = Vec::new(); + let mut current: Vec = Vec::new(); + + for block in blocks { + if matches!(block.kind, MarkdownBlockKind::Heading) && !current.is_empty() { + push_section_chunks(&mut chunks, ¤t, target_tokens, overlap_tokens); + current.clear(); + } + current.push(block.clone()); + } + + if !current.is_empty() { + push_section_chunks(&mut chunks, ¤t, target_tokens, overlap_tokens); + } + + for (index, chunk) in chunks.iter_mut().enumerate() { + chunk.index = index; + } + + chunks +} + +fn push_section_chunks( + chunks: &mut Vec, + section: &[MarkdownBlock], + target_tokens: usize, + overlap_tokens: usize, +) { + let text = section_text(section); + if text.trim().is_empty() { + return; + } + + let token_estimate = estimate_tokens(&text); + let heading_path = section + .iter() + .rev() + .find(|block| !block.heading_path.is_empty()) + .map(|block| block.heading_path.clone()) + .unwrap_or_default(); + let start_line = section.first().map_or(1, |block| block.start_line); + let end_line = section.last().map_or(start_line, |block| block.end_line); + + if token_estimate <= target_tokens { + chunks.push(NoteChunk { + index: 0, + heading_path, + content_hash: sha256_hex(&text), + content: text, + chunk_type: ChunkType::HeadingSection, + start_line, + end_line, + token_estimate, + }); + return; + } + + // Big sections get split by rough words first. Good enough for now, easy to inspect later. + for part in split_with_overlap(&text, target_tokens, overlap_tokens) { + let token_estimate = estimate_tokens(&part); + chunks.push(NoteChunk { + index: 0, + heading_path: heading_path.clone(), + content_hash: sha256_hex(&part), + content: part, + chunk_type: ChunkType::SplitSection, + start_line, + end_line, + token_estimate, + }); + } +} + +fn section_text(section: &[MarkdownBlock]) -> String { + section + .iter() + .map(|block| block.text.trim()) + .filter(|text| !text.is_empty()) + .collect::>() + .join("\n\n") +} + +pub fn estimate_tokens(content: &str) -> usize { + let words = content.split_whitespace().count(); + words.max(1) +} + +fn split_with_overlap(content: &str, target_tokens: usize, overlap_tokens: usize) -> Vec { + let words: Vec<_> = content.split_whitespace().collect(); + if words.is_empty() { + return Vec::new(); + } + + let mut parts = Vec::new(); + let mut start = 0; + let step = target_tokens.saturating_sub(overlap_tokens).max(1); + + while start < words.len() { + let end = (start + target_tokens).min(words.len()); + let part = words[start..end].join(" "); + parts.push(trim_to_sentenceish_boundary(part)); + if end == words.len() { + break; + } + start += step; + } + + parts +} + +fn trim_to_sentenceish_boundary(part: String) -> String { + if part.ends_with('.') || part.ends_with('!') || part.ends_with('?') || part.len() < 240 { + return part; + } + + match part.rfind(['.', '!', '?']) { + Some(idx) if idx > part.len() / 2 => part[..=idx].to_string(), + _ => part, + } +} + +pub fn chunk_type_name(kind: &ChunkType) -> &'static str { + match kind { + ChunkType::HeadingSection => "heading_section", + ChunkType::SplitSection => "split_section", + } +} + +#[cfg(test)] +mod tests { + use crate::markdown::{MarkdownBlock, MarkdownBlockKind}; + + use super::{ChunkType, build_chunks}; + + #[test] + fn builds_heading_chunks_in_order() { + let blocks = vec![ + block(MarkdownBlockKind::Heading, "A", 1, vec!["A"]), + block(MarkdownBlockKind::Paragraph, "one", 2, vec!["A"]), + block(MarkdownBlockKind::Heading, "B", 3, vec!["B"]), + block(MarkdownBlockKind::Paragraph, "two", 4, vec!["B"]), + ]; + + let chunks = build_chunks(&blocks, 100, 10); + + assert_eq!(chunks.len(), 2); + assert_eq!(chunks[0].heading_path, vec!["A"]); + assert_eq!(chunks[1].heading_path, vec!["B"]); + assert!(matches!(chunks[0].chunk_type, ChunkType::HeadingSection)); + } + + #[test] + fn splits_large_sections_with_overlap() { + let text = (0..30) + .map(|idx| format!("word{idx}")) + .collect::>() + .join(" "); + let blocks = vec![block(MarkdownBlockKind::Paragraph, &text, 1, vec![])]; + + let chunks = build_chunks(&blocks, 10, 2); + + assert!(chunks.len() > 1); + assert!(chunks.iter().all(|chunk| chunk.token_estimate <= 10)); + assert!(matches!(chunks[1].chunk_type, ChunkType::SplitSection)); + } + + fn block( + kind: MarkdownBlockKind, + text: &str, + line: usize, + heading_path: Vec<&str>, + ) -> MarkdownBlock { + MarkdownBlock { + kind, + text: text.to_string(), + start_line: line, + end_line: line, + heading_path: heading_path.into_iter().map(String::from).collect(), + } + } +} diff --git a/src/db.rs b/src/db.rs index bed5747..43b32ed 100644 --- a/src/db.rs +++ b/src/db.rs @@ -6,13 +6,25 @@ use rusqlite::{Connection, OptionalExtension, params}; use sha2::{Digest, Sha256}; use tracing::debug; -use crate::markdown::MarkdownBlockKind; +use crate::chunk::chunk_type_name; use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex}; +const INDEX_VERSION: i64 = 2; + pub struct IndexStore { conn: Connection, } +#[derive(Clone, Debug, serde::Serialize)] +pub struct SearchHit { + pub path: String, + pub title: String, + pub heading_path: String, + pub snippet: String, + pub score: f64, + pub token_estimate: usize, +} + impl IndexStore { pub fn open(path: &Path) -> Result { if let Some(parent) = path.parent() { @@ -34,8 +46,8 @@ impl IndexStore { // This is a rebuildable cache, so changed notes get their child rows replaced in place. for note in &index.notes { summary.notes_seen += 1; - let existing_hash = existing_note_hash(&tx, ¬e.path)?; - if existing_hash.as_deref() == Some(note.content_hash.as_str()) { + let fresh = existing_note_fresh(&tx, ¬e.path, ¬e.content_hash)?; + if fresh { summary.unchanged_notes += 1; debug!(path = %note.path.display(), "skipping unchanged note"); continue; @@ -49,10 +61,51 @@ impl IndexStore { insert_links(&tx, note_id, note, &mut summary)?; } + rebuild_fts_if_empty(&tx)?; tx.commit()?; Ok(summary) } + pub fn search(&self, query: &str, limit: usize) -> Result> { + let fts_query = fts_query(query); + if fts_query.is_empty() { + return Ok(Vec::new()); + } + + let mut stmt = self.conn.prepare( + r#" + SELECT + notes.path, + notes.title, + chunks.heading_path, + snippet(chunks_fts, 0, '[', ']', '...', 18) AS snippet, + bm25(chunks_fts) AS score, + chunks.token_estimate + FROM chunks_fts + JOIN chunks ON chunks.id = chunks_fts.rowid + JOIN notes ON notes.id = chunks.note_id + WHERE chunks_fts MATCH ?1 + ORDER BY score + LIMIT ?2 + "#, + )?; + + let hits = stmt + .query_map(params![fts_query, limit as i64], |row| { + Ok(SearchHit { + path: row.get(0)?, + title: row.get(1)?, + heading_path: row.get(2)?, + snippet: row.get(3)?, + score: -row.get::<_, f64>(4)?, + token_estimate: row.get::<_, i64>(5)? as usize, + }) + })? + .collect::>>()?; + + Ok(hits) + } + fn bootstrap(&self) -> Result<()> { self.conn.execute_batch( r#" @@ -72,6 +125,7 @@ impl IndexStore { modified_unix_secs INTEGER, file_size INTEGER NOT NULL, content_hash TEXT NOT NULL, + index_version INTEGER NOT NULL DEFAULT 2, created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP ); @@ -91,6 +145,14 @@ impl IndexStore { UNIQUE(note_id, chunk_index) ); + CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts + USING fts5( + content, + path UNINDEXED, + title UNINDEXED, + heading_path UNINDEXED + ); + CREATE TABLE IF NOT EXISTS tags ( id INTEGER PRIMARY KEY, name TEXT NOT NULL UNIQUE @@ -116,18 +178,39 @@ impl IndexStore { INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index'); "#, )?; + ensure_notes_index_version(&self.conn)?; Ok(()) } } -fn existing_note_hash(conn: &Connection, path: &Path) -> Result> { - conn.query_row( - "SELECT content_hash FROM notes WHERE path = ?1", - [path_to_db(path)], - |row| row.get(0), - ) - .optional() - .context("failed to read existing note hash") +fn existing_note_fresh(conn: &Connection, path: &Path, content_hash: &str) -> Result { + let existing = conn + .query_row( + "SELECT content_hash, index_version FROM notes WHERE path = ?1", + [path_to_db(path)], + |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)), + ) + .optional() + .context("failed to read existing note freshness")?; + + Ok(existing.is_some_and(|(hash, version)| hash == content_hash && version == INDEX_VERSION)) +} + +fn ensure_notes_index_version(conn: &Connection) -> Result<()> { + let mut stmt = conn.prepare("PRAGMA table_info(notes)")?; + let columns = stmt + .query_map([], |row| row.get::<_, String>(1))? + .collect::>>()?; + + if !columns.iter().any(|column| column == "index_version") { + conn.execute( + "ALTER TABLE notes ADD COLUMN index_version INTEGER NOT NULL DEFAULT 1", + [], + ) + .context("failed to add notes.index_version")?; + } + + Ok(()) } fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result { @@ -140,15 +223,17 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result { modified_unix_secs, file_size, content_hash, + index_version, updated_at ) - VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, CURRENT_TIMESTAMP) ON CONFLICT(path) DO UPDATE SET filename = excluded.filename, title = excluded.title, modified_unix_secs = excluded.modified_unix_secs, file_size = excluded.file_size, content_hash = excluded.content_hash, + index_version = excluded.index_version, updated_at = CURRENT_TIMESTAMP "#, params![ @@ -158,6 +243,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result { note.modified_unix_secs, note.file_size, note.content_hash, + INDEX_VERSION, ], )?; @@ -170,6 +256,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result { } fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> { + delete_note_fts(conn, note_id)?; conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?; conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?; conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?; @@ -182,8 +269,8 @@ fn insert_chunks( note: &NoteMetadata, summary: &mut IndexWriteSummary, ) -> Result<()> { - for (idx, block) in note.blocks.iter().enumerate() { - if block.text.trim().is_empty() { + for chunk in ¬e.chunks { + if chunk.content.trim().is_empty() { continue; } @@ -204,21 +291,73 @@ fn insert_chunks( "#, params![ note_id, - idx as i64, - block.heading_path.join(" > "), - block.text, - chunk_type(&block.kind), - block.start_line as i64, - block.end_line as i64, - estimate_tokens(&block.text) as i64, - sha256_hex(&block.text), + chunk.index as i64, + chunk.heading_path.join(" > "), + chunk.content, + chunk_type_name(&chunk.chunk_type), + chunk.start_line as i64, + chunk.end_line as i64, + chunk.token_estimate as i64, + chunk.content_hash, ], )?; + let chunk_id = conn.last_insert_rowid(); + insert_chunk_fts(conn, chunk_id, note, chunk)?; summary.chunks_written += 1; } Ok(()) } +fn insert_chunk_fts( + conn: &Connection, + chunk_id: i64, + note: &NoteMetadata, + chunk: &crate::chunk::NoteChunk, +) -> Result<()> { + conn.execute( + "INSERT INTO chunks_fts (rowid, content, path, title, heading_path) VALUES (?1, ?2, ?3, ?4, ?5)", + params![ + chunk_id, + chunk.content, + path_to_db(¬e.path), + note.title, + chunk.heading_path.join(" > "), + ], + )?; + Ok(()) +} + +fn delete_note_fts(conn: &Connection, note_id: i64) -> Result<()> { + let mut stmt = conn.prepare("SELECT id FROM chunks WHERE note_id = ?1")?; + let chunk_ids = stmt + .query_map([note_id], |row| row.get::<_, i64>(0))? + .collect::>>()?; + + for chunk_id in chunk_ids { + conn.execute("DELETE FROM chunks_fts WHERE rowid = ?1", [chunk_id])?; + } + Ok(()) +} + +fn rebuild_fts_if_empty(conn: &Connection) -> Result<()> { + let fts_count: i64 = conn.query_row("SELECT count(*) FROM chunks_fts", [], |row| row.get(0))?; + let chunk_count: i64 = conn.query_row("SELECT count(*) FROM chunks", [], |row| row.get(0))?; + if fts_count > 0 || chunk_count == 0 { + return Ok(()); + } + + conn.execute( + r#" + INSERT INTO chunks_fts (rowid, content, path, title, heading_path) + SELECT chunks.id, chunks.content, notes.path, notes.title, chunks.heading_path + FROM chunks + JOIN notes ON notes.id = chunks.note_id + "#, + [], + )?; + Ok(()) +} + fn insert_tags( conn: &Connection, note_id: i64, @@ -259,19 +398,16 @@ pub fn sha256_hex(content: &str) -> String { format!("{:x}", Sha256::digest(content.as_bytes())) } -fn estimate_tokens(content: &str) -> usize { - content.split_whitespace().count().max(1) -} - -fn chunk_type(kind: &MarkdownBlockKind) -> &'static str { - match kind { - MarkdownBlockKind::Heading => "heading", - MarkdownBlockKind::Paragraph => "paragraph", - MarkdownBlockKind::CodeBlock => "code_block", - MarkdownBlockKind::List => "list", - } -} - fn path_to_db(path: &Path) -> String { PathBuf::from(path).to_string_lossy().replace('\\', "/") } + +fn fts_query(query: &str) -> String { + query + .split_whitespace() + .map(|term| term.trim_matches(|c: char| !c.is_alphanumeric() && c != '_' && c != '-')) + .filter(|term| !term.is_empty()) + .map(|term| format!("\"{}\"", term.replace('"', "\"\""))) + .collect::>() + .join(" ") +} diff --git a/src/main.rs b/src/main.rs index cccf282..c2944f4 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,3 +1,4 @@ +mod chunk; mod cli; mod config; mod db; @@ -11,7 +12,7 @@ use tracing::{debug, info}; use crate::cli::{Cli, Commands, OutputFormat}; use crate::config::Config; -use crate::db::IndexStore; +use crate::db::{IndexStore, SearchHit}; use crate::vault::VaultIndex; fn main() -> Result<()> { @@ -54,21 +55,15 @@ fn main() -> Result<()> { limit, output, } => { - let index = VaultIndex::scan(&config)?; - let results = index.search(&query, limit); + let db_path = ensure_index_cache(&config)?; + let store = IndexStore::open(&db_path)?; + let results = store.search(&query, limit)?; match output { OutputFormat::Text => { if results.is_empty() { println!("No matches."); } - for (position, result) in results.iter().enumerate() { - println!("{}. {}", position + 1, result.note.path.display()); - println!(" title: {}", result.note.title); - if !result.note.headings.is_empty() { - println!(" headings: {}", result.note.headings.join(" > ")); - } - println!(" score: {}", result.score); - } + print_search_results(&results); } OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&results)?), } @@ -105,3 +100,29 @@ fn init_project(config: &Config, force: bool) -> Result<()> { println!("Config: {}", Config::default_path().display()); Ok(()) } + +fn ensure_index_cache(config: &Config) -> Result { + let db_path = config.vault.path.join(&config.database.path); + if db_path.exists() { + return Ok(db_path); + } + + let index = VaultIndex::scan(config)?; + config.create_agent_dirs()?; + let mut store = IndexStore::open(&db_path)?; + store.write_index(&index)?; + Ok(db_path) +} + +fn print_search_results(results: &[SearchHit]) { + for (position, result) in results.iter().enumerate() { + println!("{}. {}", position + 1, result.path); + println!(" title: {}", result.title); + if !result.heading_path.is_empty() { + println!(" heading: {}", result.heading_path); + } + println!(" tokens: {}", result.token_estimate); + println!(" score: {:.4}", result.score); + println!(" {}", result.snippet); + } +} diff --git a/src/vault.rs b/src/vault.rs index 63feeb2..c508a7f 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -8,6 +8,7 @@ use serde::Serialize; use tracing::{debug, warn}; use walkdir::{DirEntry, WalkDir}; +use crate::chunk::{NoteChunk, build_chunks}; use crate::config::Config; use crate::db::sha256_hex; use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown}; @@ -30,6 +31,7 @@ pub struct NoteMetadata { pub content_hash: String, pub headings: Vec, pub blocks: Vec, + pub chunks: Vec, pub wikilinks: Vec, pub tags: Vec, } @@ -41,6 +43,7 @@ pub struct IndexSummary { pub markdown_files: usize, pub headings: usize, pub blocks: usize, + pub chunks: usize, pub wikilinks: usize, pub tags: usize, pub skipped_dirs: Vec, @@ -99,7 +102,7 @@ impl VaultIndex { continue; } - let note = read_note(entry.path(), &config.vault.path)?; + let note = read_note(entry.path(), &config.vault.path, config)?; debug!( path = %note.path.display(), title = %note.title, @@ -129,6 +132,7 @@ impl VaultIndex { markdown_files: self.markdown_count, headings: self.notes.iter().map(|note| note.headings.len()).sum(), blocks: self.notes.iter().map(|note| note.blocks.len()).sum(), + chunks: self.notes.iter().map(|note| note.chunks.len()).sum(), wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(), tags: self.notes.iter().map(|note| note.tags.len()).sum(), skipped_dirs: self.skipped_dirs.clone(), @@ -236,6 +240,7 @@ impl fmt::Display for IndexSummary { writeln!(f, "Markdown files: {}", self.markdown_files)?; writeln!(f, "Headings parsed: {}", self.headings)?; writeln!(f, "Markdown blocks: {}", self.blocks)?; + writeln!(f, "Chunks: {}", self.chunks)?; writeln!(f, "Wikilinks: {}", self.wikilinks)?; writeln!(f, "Tags: {}", self.tags)?; writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?; @@ -248,7 +253,7 @@ impl fmt::Display for IndexSummary { } } -fn read_note(path: &Path, vault_path: &Path) -> Result { +fn read_note(path: &Path, vault_path: &Path, config: &Config) -> Result { let content = fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?; let metadata = @@ -256,6 +261,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result { let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf(); let source_path = relative_path.to_string_lossy().replace('\\', "/"); let parsed = parse_markdown(&source_path, &content); + let chunks = build_chunks( + &parsed.blocks, + config.index.chunk_target_tokens, + config.index.chunk_overlap_tokens, + ); let content_hash = sha256_hex(&content); Ok(NoteMetadata { @@ -275,6 +285,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result { content_hash, headings: parsed.headings, blocks: parsed.blocks, + chunks, wikilinks: parsed.wikilinks, tags: parsed.tags, })