Add heading chunks and SQLite FTS search

This commit is contained in:
K. Hodges 2026-05-24 03:09:13 -07:00
parent 9fb82b5324
commit 15854cc91e
5 changed files with 431 additions and 53 deletions

View File

@ -327,7 +327,7 @@ Detect changed notes efficiently.
# Phase 4 — Chunking
## [ ] GM-016 — Implement heading-based chunking
## [x] GM-016 — Implement heading-based chunking
### Goals
Split notes into useful retrieval units.
@ -344,7 +344,7 @@ Split notes into useful retrieval units.
---
## [ ] GM-017 — Add fallback chunk splitting
## [x] GM-017 — Add fallback chunk splitting
### Goals
Handle giant sections safely.
@ -360,7 +360,7 @@ Handle giant sections safely.
---
## [ ] GM-018 — Estimate token counts
## [x] GM-018 — Estimate token counts
### Goals
Prepare for LLM context budgeting.
@ -378,7 +378,7 @@ Prepare for LLM context budgeting.
# Phase 5 — Search
## [ ] GM-019 — Implement SQLite FTS search
## [x] GM-019 — Implement SQLite FTS search
### Goals
Add keyword search.
@ -396,7 +396,7 @@ Add keyword search.
---
## [ ] GM-020 — Implement basic CLI search command
## [x] GM-020 — Implement basic CLI search command
### Goals
Expose usable search interface.

210
src/chunk.rs Normal file
View File

@ -0,0 +1,210 @@
use serde::Serialize;
use crate::db::sha256_hex;
use crate::markdown::{MarkdownBlock, MarkdownBlockKind};
/// One retrieval unit cut from a note: either a whole heading section or a
/// slice of an oversized section.
#[derive(Clone, Debug, Serialize)]
pub struct NoteChunk {
/// Position of the chunk within its note; assigned by `build_chunks` after
/// all sections have been processed.
pub index: usize,
/// Heading trail copied from the last block in the section that carries one;
/// empty when no block in the section has a heading path.
pub heading_path: Vec<String>,
/// Chunk text (trimmed block texts joined by blank lines, or a word window
/// of that text for split sections).
pub content: String,
/// Whether the chunk is a whole section or a split piece.
pub chunk_type: ChunkType,
/// Start line of the section the chunk came from. Split chunks all share
/// the section's line span rather than their own.
pub start_line: usize,
/// End line of the section the chunk came from.
pub end_line: usize,
/// Rough token count (whitespace-separated words, minimum 1).
pub token_estimate: usize,
/// `sha256_hex` digest of `content`.
pub content_hash: String,
}
/// How a chunk was produced. Serialized in `snake_case`
/// (`heading_section`, `split_section`).
#[derive(Clone, Debug, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ChunkType {
/// A complete section that fit within the token target.
HeadingSection,
/// One overlapping slice of a section that exceeded the token target.
SplitSection,
}
/// Groups parsed Markdown blocks into heading-delimited sections and turns each
/// section into one or more [`NoteChunk`]s.
///
/// A new section starts at every `Heading` block; blocks before the first
/// heading form their own section. Sections whose estimated size exceeds
/// `target_tokens` are split into overlapping pieces (`overlap_tokens` words of
/// overlap). The returned chunks are numbered consecutively from 0 in document
/// order.
pub fn build_chunks(
    blocks: &[MarkdownBlock],
    target_tokens: usize,
    overlap_tokens: usize,
) -> Vec<NoteChunk> {
    let mut chunks = Vec::new();
    // Index of the first block of the section currently being accumulated.
    // Sections are contiguous runs of `blocks`, so they are passed on as
    // sub-slices instead of being cloned into a scratch buffer.
    let mut section_start = 0;
    for (position, block) in blocks.iter().enumerate() {
        let opens_new_section =
            position > section_start && matches!(block.kind, MarkdownBlockKind::Heading);
        if opens_new_section {
            push_section_chunks(
                &mut chunks,
                &blocks[section_start..position],
                target_tokens,
                overlap_tokens,
            );
            section_start = position;
        }
    }
    if section_start < blocks.len() {
        push_section_chunks(
            &mut chunks,
            &blocks[section_start..],
            target_tokens,
            overlap_tokens,
        );
    }
    // Sections may emit zero, one or many chunks, so indices are assigned last.
    for (index, chunk) in chunks.iter_mut().enumerate() {
        chunk.index = index;
    }
    chunks
}
/// Appends the chunk(s) for one heading section to `chunks`.
///
/// Sections with no non-blank text are ignored. A section at or below
/// `target_tokens` becomes a single `HeadingSection` chunk; a larger one is cut
/// into `SplitSection` chunks that all share the section's heading path and
/// line span. Every chunk is pushed with `index: 0`; the caller renumbers.
fn push_section_chunks(
    chunks: &mut Vec<NoteChunk>,
    section: &[MarkdownBlock],
    target_tokens: usize,
    overlap_tokens: usize,
) {
    let text = section_text(section);
    if text.trim().is_empty() {
        return;
    }

    let start_line = section.first().map_or(1, |block| block.start_line);
    let end_line = section.last().map_or(start_line, |block| block.end_line);
    // Use the heading trail of the last block in the section that has one.
    let heading_path = section
        .iter()
        .rev()
        .map(|block| &block.heading_path)
        .find(|path| !path.is_empty())
        .cloned()
        .unwrap_or_default();

    // Big sections get split by rough words first. Good enough for now, easy to inspect later.
    let (chunk_type, pieces) = if estimate_tokens(&text) <= target_tokens {
        (ChunkType::HeadingSection, vec![text])
    } else {
        (
            ChunkType::SplitSection,
            split_with_overlap(&text, target_tokens, overlap_tokens),
        )
    };

    chunks.extend(pieces.into_iter().map(|content| NoteChunk {
        index: 0,
        heading_path: heading_path.clone(),
        content_hash: sha256_hex(&content),
        token_estimate: estimate_tokens(&content),
        content,
        chunk_type: chunk_type.clone(),
        start_line,
        end_line,
    }));
}
/// Concatenates the trimmed text of every non-blank block in `section`,
/// separating blocks with one blank line.
fn section_text(section: &[MarkdownBlock]) -> String {
    let mut joined = String::new();
    for block in section {
        let piece = block.text.trim();
        if piece.is_empty() {
            continue;
        }
        if !joined.is_empty() {
            joined.push_str("\n\n");
        }
        joined.push_str(piece);
    }
    joined
}
/// Rough token estimate: the number of whitespace-separated words in
/// `content`, never less than 1 (even for empty or blank input).
pub fn estimate_tokens(content: &str) -> usize {
    match content.split_whitespace().count() {
        0 => 1,
        word_count => word_count,
    }
}
/// Splits `content` into overlapping windows of at most `target_tokens`
/// whitespace-separated words, preferring to end each window on a sentence
/// boundary.
///
/// Consecutive windows share up to `overlap_tokens` words. When a window is
/// trimmed back to a sentence end, the next window starts relative to the
/// trimmed end, so words that were cut off are carried into the following part
/// instead of being dropped. A `target_tokens` of 0 is treated as 1.
/// Returns an empty vector when `content` has no words.
fn split_with_overlap(content: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<String> {
    let words: Vec<_> = content.split_whitespace().collect();
    if words.is_empty() {
        return Vec::new();
    }
    // A zero-sized window could only ever produce empty parts.
    let window_size = target_tokens.max(1);
    let mut parts = Vec::new();
    let mut start = 0;
    // `words[..covered]` have already been emitted in at least one part.
    let mut covered = 0;
    while start < words.len() {
        let end = start.saturating_add(window_size).min(words.len());
        let window = &words[start..end];
        let untrimmed = window.join(" ");
        let untrimmed_len = untrimmed.len();
        let trimmed = trim_to_sentenceish_boundary(untrimmed);
        let (part, kept_end) = if trimmed.len() == untrimmed_len {
            (trimmed, end)
        } else {
            let kept_end = start + whole_words_within(window, trimmed.len());
            if kept_end > covered {
                (trimmed, kept_end)
            } else {
                // The sentence boundary lies entirely inside already-emitted
                // text; keep the full window so the split makes progress.
                (window.join(" "), end)
            }
        };
        parts.push(part);
        covered = kept_end;
        if covered == words.len() {
            break;
        }
        // Step back by the overlap from what was actually kept, but always
        // move forward by at least one word.
        start = kept_end.saturating_sub(overlap_tokens).max(start + 1);
    }
    parts
}

/// Counts how many leading words of `window` (as joined by single spaces) fit
/// completely within the first `byte_len` bytes of the joined text.
fn whole_words_within(window: &[&str], byte_len: usize) -> usize {
    let mut offset = 0;
    let mut count = 0;
    for word in window {
        let word_end = offset + word.len();
        if word_end > byte_len {
            break;
        }
        count += 1;
        // Skip the single space separating this word from the next.
        offset = word_end + 1;
    }
    count
}

/// Cuts `part` back to its last `.`, `!` or `?` when that leaves more than
/// half of the text; otherwise returns `part` unchanged.
///
/// Parts shorter than 240 bytes, or already ending in sentence punctuation,
/// are returned as-is.
fn trim_to_sentenceish_boundary(part: String) -> String {
    if part.ends_with('.') || part.ends_with('!') || part.ends_with('?') || part.len() < 240 {
        return part;
    }
    match part.rfind(['.', '!', '?']) {
        // The matched characters are ASCII, so `..=idx` ends on a char boundary.
        Some(idx) if idx > part.len() / 2 => part[..=idx].to_string(),
        _ => part,
    }
}
/// Returns the fixed string label for a [`ChunkType`] variant
/// (`"heading_section"` or `"split_section"`).
pub fn chunk_type_name(kind: &ChunkType) -> &'static str {
    match *kind {
        ChunkType::SplitSection => "split_section",
        ChunkType::HeadingSection => "heading_section",
    }
}
#[cfg(test)]
mod tests {
    use crate::markdown::{MarkdownBlock, MarkdownBlockKind};

    use super::{ChunkType, build_chunks};

    /// Builds a single-line block fixture with the given kind, text and heading trail.
    fn block(
        kind: MarkdownBlockKind,
        text: &str,
        line: usize,
        heading_path: Vec<&str>,
    ) -> MarkdownBlock {
        let heading_path = heading_path.into_iter().map(String::from).collect();
        MarkdownBlock {
            kind,
            text: text.to_string(),
            start_line: line,
            end_line: line,
            heading_path,
        }
    }

    /// Two heading sections produce two chunks, in document order.
    #[test]
    fn builds_heading_chunks_in_order() {
        let fixture = vec![
            block(MarkdownBlockKind::Heading, "A", 1, vec!["A"]),
            block(MarkdownBlockKind::Paragraph, "one", 2, vec!["A"]),
            block(MarkdownBlockKind::Heading, "B", 3, vec!["B"]),
            block(MarkdownBlockKind::Paragraph, "two", 4, vec!["B"]),
        ];

        let produced = build_chunks(&fixture, 100, 10);

        assert_eq!(produced.len(), 2);
        assert_eq!(produced[0].heading_path, vec!["A"]);
        assert_eq!(produced[1].heading_path, vec!["B"]);
        assert!(matches!(produced[0].chunk_type, ChunkType::HeadingSection));
    }

    /// A 30-word paragraph with a 10-token target is cut into several split chunks.
    #[test]
    fn splits_large_sections_with_overlap() {
        let mut text = String::new();
        for idx in 0..30 {
            if idx > 0 {
                text.push(' ');
            }
            text.push_str(&format!("word{idx}"));
        }
        let fixture = vec![block(MarkdownBlockKind::Paragraph, &text, 1, vec![])];

        let produced = build_chunks(&fixture, 10, 2);

        assert!(produced.len() > 1);
        assert!(produced.iter().all(|chunk| chunk.token_estimate <= 10));
        assert!(matches!(produced[1].chunk_type, ChunkType::SplitSection));
    }
}

206
src/db.rs
View File

@ -6,13 +6,25 @@ use rusqlite::{Connection, OptionalExtension, params};
use sha2::{Digest, Sha256};
use tracing::debug;
use crate::markdown::MarkdownBlockKind;
use crate::chunk::chunk_type_name;
use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
const INDEX_VERSION: i64 = 2;
pub struct IndexStore {
conn: Connection,
}
/// One full-text search result, as returned by `IndexStore::search`.
#[derive(Clone, Debug, serde::Serialize)]
pub struct SearchHit {
/// `notes.path` of the note containing the matching chunk.
pub path: String,
/// `notes.title` of that note.
pub title: String,
/// Stored `chunks.heading_path` (the heading trail joined with " > ").
pub heading_path: String,
/// FTS5 `snippet()` excerpt of the chunk content, with matches wrapped in `[` `]`.
pub snippet: String,
/// Negated `bm25()` rank, so larger values mean better matches.
pub score: f64,
/// Stored token estimate of the matching chunk.
pub token_estimate: usize,
}
impl IndexStore {
pub fn open(path: &Path) -> Result<Self> {
if let Some(parent) = path.parent() {
@ -34,8 +46,8 @@ impl IndexStore {
// This is a rebuildable cache, so changed notes get their child rows replaced in place.
for note in &index.notes {
summary.notes_seen += 1;
let existing_hash = existing_note_hash(&tx, &note.path)?;
if existing_hash.as_deref() == Some(note.content_hash.as_str()) {
let fresh = existing_note_fresh(&tx, &note.path, &note.content_hash)?;
if fresh {
summary.unchanged_notes += 1;
debug!(path = %note.path.display(), "skipping unchanged note");
continue;
@ -49,10 +61,51 @@ impl IndexStore {
insert_links(&tx, note_id, note, &mut summary)?;
}
rebuild_fts_if_empty(&tx)?;
tx.commit()?;
Ok(summary)
}
/// Runs a full-text search over indexed chunks and returns up to `limit` hits,
/// best match first.
///
/// The raw `query` is converted into a quoted FTS5 expression by `fts_query`;
/// if nothing searchable remains, an empty result is returned without touching
/// the database. Scores are the negated `bm25()` rank, so higher is better.
///
/// # Errors
/// Returns an error if the statement cannot be prepared or a row cannot be read.
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
    let match_expr = fts_query(query);
    if match_expr.is_empty() {
        return Ok(Vec::new());
    }
    // SQLite binds integers as i64; clamp instead of wrapping with `as`.
    let row_limit = i64::try_from(limit).unwrap_or(i64::MAX);
    let mut stmt = self.conn.prepare(
        r#"
        SELECT
            notes.path,
            notes.title,
            chunks.heading_path,
            snippet(chunks_fts, 0, '[', ']', '...', 18) AS snippet,
            bm25(chunks_fts) AS score,
            chunks.token_estimate
        FROM chunks_fts
        JOIN chunks ON chunks.id = chunks_fts.rowid
        JOIN notes ON notes.id = chunks.note_id
        WHERE chunks_fts MATCH ?1
        ORDER BY score
        LIMIT ?2
        "#,
    )?;
    let hits = stmt
        .query_map(params![match_expr, row_limit], |row| {
            // bm25() is smaller for better matches (the query sorts ascending);
            // flip the sign so callers see "higher is better".
            let rank: f64 = row.get(4)?;
            let stored_tokens: i64 = row.get(5)?;
            Ok(SearchHit {
                path: row.get(0)?,
                title: row.get(1)?,
                heading_path: row.get(2)?,
                snippet: row.get(3)?,
                score: -rank,
                // A negative stored value would wrap under `as usize`; report 0 instead.
                token_estimate: usize::try_from(stored_tokens).unwrap_or(0),
            })
        })?
        .collect::<rusqlite::Result<Vec<_>>>()?;
    Ok(hits)
}
fn bootstrap(&self) -> Result<()> {
self.conn.execute_batch(
r#"
@ -72,6 +125,7 @@ impl IndexStore {
modified_unix_secs INTEGER,
file_size INTEGER NOT NULL,
content_hash TEXT NOT NULL,
index_version INTEGER NOT NULL DEFAULT 2,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
@ -91,6 +145,14 @@ impl IndexStore {
UNIQUE(note_id, chunk_index)
);
CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts
USING fts5(
content,
path UNINDEXED,
title UNINDEXED,
heading_path UNINDEXED
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
@ -116,18 +178,39 @@ impl IndexStore {
INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
"#,
)?;
ensure_notes_index_version(&self.conn)?;
Ok(())
}
}
fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> {
conn.query_row(
"SELECT content_hash FROM notes WHERE path = ?1",
[path_to_db(path)],
|row| row.get(0),
)
.optional()
.context("failed to read existing note hash")
/// Reports whether the note stored at `path` is up to date: it must exist, its
/// stored content hash must equal `content_hash`, and it must have been written
/// with the current `INDEX_VERSION`.
fn existing_note_fresh(conn: &Connection, path: &Path, content_hash: &str) -> Result<bool> {
    let stored: Option<(String, i64)> = conn
        .query_row(
            "SELECT content_hash, index_version FROM notes WHERE path = ?1",
            [path_to_db(path)],
            |row| Ok((row.get(0)?, row.get(1)?)),
        )
        .optional()
        .context("failed to read existing note freshness")?;
    // No row means the note has never been indexed.
    let Some((stored_hash, stored_version)) = stored else {
        return Ok(false);
    };
    Ok(stored_hash == content_hash && stored_version == INDEX_VERSION)
}
/// Adds the `index_version` column to `notes` when the table does not have it yet.
///
/// The column is added with a default of 1, so rows that existed before the
/// column was introduced carry version 1.
fn ensure_notes_index_version(conn: &Connection) -> Result<()> {
    let mut table_info = conn.prepare("PRAGMA table_info(notes)")?;
    // Column 1 of `PRAGMA table_info` is the column name.
    let column_names = table_info.query_map([], |row| row.get::<_, String>(1))?;
    let mut has_index_version = false;
    for name in column_names {
        if name? == "index_version" {
            has_index_version = true;
        }
    }
    if has_index_version {
        return Ok(());
    }
    conn.execute(
        "ALTER TABLE notes ADD COLUMN index_version INTEGER NOT NULL DEFAULT 1",
        [],
    )
    .context("failed to add notes.index_version")?;
    Ok(())
}
fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
@ -140,15 +223,17 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
modified_unix_secs,
file_size,
content_hash,
index_version,
updated_at
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, CURRENT_TIMESTAMP)
ON CONFLICT(path) DO UPDATE SET
filename = excluded.filename,
title = excluded.title,
modified_unix_secs = excluded.modified_unix_secs,
file_size = excluded.file_size,
content_hash = excluded.content_hash,
index_version = excluded.index_version,
updated_at = CURRENT_TIMESTAMP
"#,
params![
@ -158,6 +243,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
note.modified_unix_secs,
note.file_size,
note.content_hash,
INDEX_VERSION,
],
)?;
@ -170,6 +256,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
}
fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
delete_note_fts(conn, note_id)?;
conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?;
conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?;
conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?;
@ -182,8 +269,8 @@ fn insert_chunks(
note: &NoteMetadata,
summary: &mut IndexWriteSummary,
) -> Result<()> {
for (idx, block) in note.blocks.iter().enumerate() {
if block.text.trim().is_empty() {
for chunk in &note.chunks {
if chunk.content.trim().is_empty() {
continue;
}
@ -204,21 +291,73 @@ fn insert_chunks(
"#,
params![
note_id,
idx as i64,
block.heading_path.join(" > "),
block.text,
chunk_type(&block.kind),
block.start_line as i64,
block.end_line as i64,
estimate_tokens(&block.text) as i64,
sha256_hex(&block.text),
chunk.index as i64,
chunk.heading_path.join(" > "),
chunk.content,
chunk_type_name(&chunk.chunk_type),
chunk.start_line as i64,
chunk.end_line as i64,
chunk.token_estimate as i64,
chunk.content_hash,
],
)?;
let chunk_id = conn.last_insert_rowid();
insert_chunk_fts(conn, chunk_id, note, chunk)?;
summary.chunks_written += 1;
}
Ok(())
}
fn insert_chunk_fts(
conn: &Connection,
chunk_id: i64,
note: &NoteMetadata,
chunk: &crate::chunk::NoteChunk,
) -> Result<()> {
conn.execute(
"INSERT INTO chunks_fts (rowid, content, path, title, heading_path) VALUES (?1, ?2, ?3, ?4, ?5)",
params![
chunk_id,
chunk.content,
path_to_db(&note.path),
note.title,
chunk.heading_path.join(" > "),
],
)?;
Ok(())
}
/// Removes the `chunks_fts` rows belonging to every chunk of `note_id`.
///
/// The lookup reads the `chunks` table, so the note's chunk rows must still
/// exist when this runs.
fn delete_note_fts(conn: &Connection, note_id: i64) -> Result<()> {
    // One statement instead of a SELECT followed by one DELETE per chunk.
    conn.execute(
        "DELETE FROM chunks_fts WHERE rowid IN (SELECT id FROM chunks WHERE note_id = ?1)",
        [note_id],
    )?;
    Ok(())
}
fn rebuild_fts_if_empty(conn: &Connection) -> Result<()> {
let fts_count: i64 = conn.query_row("SELECT count(*) FROM chunks_fts", [], |row| row.get(0))?;
let chunk_count: i64 = conn.query_row("SELECT count(*) FROM chunks", [], |row| row.get(0))?;
if fts_count > 0 || chunk_count == 0 {
return Ok(());
}
conn.execute(
r#"
INSERT INTO chunks_fts (rowid, content, path, title, heading_path)
SELECT chunks.id, chunks.content, notes.path, notes.title, chunks.heading_path
FROM chunks
JOIN notes ON notes.id = chunks.note_id
"#,
[],
)?;
Ok(())
}
fn insert_tags(
conn: &Connection,
note_id: i64,
@ -259,19 +398,16 @@ pub fn sha256_hex(content: &str) -> String {
format!("{:x}", Sha256::digest(content.as_bytes()))
}
fn estimate_tokens(content: &str) -> usize {
content.split_whitespace().count().max(1)
}
fn chunk_type(kind: &MarkdownBlockKind) -> &'static str {
match kind {
MarkdownBlockKind::Heading => "heading",
MarkdownBlockKind::Paragraph => "paragraph",
MarkdownBlockKind::CodeBlock => "code_block",
MarkdownBlockKind::List => "list",
}
}
fn path_to_db(path: &Path) -> String {
PathBuf::from(path).to_string_lossy().replace('\\', "/")
}
/// Turns free-form user input into an FTS5 query of space-separated quoted terms.
///
/// Each whitespace-separated term has leading and trailing characters stripped
/// unless they are alphanumeric, `_` or `-`; terms left empty are dropped. The
/// remainder is wrapped in double quotes with embedded quotes doubled.
fn fts_query(query: &str) -> String {
    let is_edge_noise = |c: char| !(c.is_alphanumeric() || c == '_' || c == '-');
    let mut phrases: Vec<String> = Vec::new();
    for raw in query.split_whitespace() {
        let term = raw.trim_matches(is_edge_noise);
        if term.is_empty() {
            continue;
        }
        phrases.push(format!("\"{}\"", term.replace('"', "\"\"")));
    }
    phrases.join(" ")
}

View File

@ -1,3 +1,4 @@
mod chunk;
mod cli;
mod config;
mod db;
@ -11,7 +12,7 @@ use tracing::{debug, info};
use crate::cli::{Cli, Commands, OutputFormat};
use crate::config::Config;
use crate::db::IndexStore;
use crate::db::{IndexStore, SearchHit};
use crate::vault::VaultIndex;
fn main() -> Result<()> {
@ -54,21 +55,15 @@ fn main() -> Result<()> {
limit,
output,
} => {
let index = VaultIndex::scan(&config)?;
let results = index.search(&query, limit);
let db_path = ensure_index_cache(&config)?;
let store = IndexStore::open(&db_path)?;
let results = store.search(&query, limit)?;
match output {
OutputFormat::Text => {
if results.is_empty() {
println!("No matches.");
}
for (position, result) in results.iter().enumerate() {
println!("{}. {}", position + 1, result.note.path.display());
println!(" title: {}", result.note.title);
if !result.note.headings.is_empty() {
println!(" headings: {}", result.note.headings.join(" > "));
}
println!(" score: {}", result.score);
}
print_search_results(&results);
}
OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&results)?),
}
@ -105,3 +100,29 @@ fn init_project(config: &Config, force: bool) -> Result<()> {
println!("Config: {}", Config::default_path().display());
Ok(())
}
/// Returns the path of the index database, building the index first when the
/// file does not exist yet.
///
/// An existing database file is returned as-is, without rescanning the vault.
fn ensure_index_cache(config: &Config) -> Result<std::path::PathBuf> {
    let db_path = config.vault.path.join(&config.database.path);
    if !db_path.exists() {
        // First use: scan the vault and write a fresh index.
        let scanned = VaultIndex::scan(config)?;
        config.create_agent_dirs()?;
        let mut store = IndexStore::open(&db_path)?;
        store.write_index(&scanned)?;
    }
    Ok(db_path)
}
/// Prints search hits to stdout as a numbered list (starting at 1) with title,
/// optional heading trail, token estimate, score and snippet.
fn print_search_results(results: &[SearchHit]) {
    for (rank, hit) in (1usize..).zip(results) {
        println!("{}. {}", rank, hit.path);
        println!(" title: {}", hit.title);
        // Hits with no heading trail omit the heading line.
        if !hit.heading_path.is_empty() {
            println!(" heading: {}", hit.heading_path);
        }
        println!(" tokens: {}", hit.token_estimate);
        println!(" score: {:.4}", hit.score);
        println!(" {}", hit.snippet);
    }
}

View File

@ -8,6 +8,7 @@ use serde::Serialize;
use tracing::{debug, warn};
use walkdir::{DirEntry, WalkDir};
use crate::chunk::{NoteChunk, build_chunks};
use crate::config::Config;
use crate::db::sha256_hex;
use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown};
@ -30,6 +31,7 @@ pub struct NoteMetadata {
pub content_hash: String,
pub headings: Vec<String>,
pub blocks: Vec<MarkdownBlock>,
pub chunks: Vec<NoteChunk>,
pub wikilinks: Vec<Wikilink>,
pub tags: Vec<String>,
}
@ -41,6 +43,7 @@ pub struct IndexSummary {
pub markdown_files: usize,
pub headings: usize,
pub blocks: usize,
pub chunks: usize,
pub wikilinks: usize,
pub tags: usize,
pub skipped_dirs: Vec<PathBuf>,
@ -99,7 +102,7 @@ impl VaultIndex {
continue;
}
let note = read_note(entry.path(), &config.vault.path)?;
let note = read_note(entry.path(), &config.vault.path, config)?;
debug!(
path = %note.path.display(),
title = %note.title,
@ -129,6 +132,7 @@ impl VaultIndex {
markdown_files: self.markdown_count,
headings: self.notes.iter().map(|note| note.headings.len()).sum(),
blocks: self.notes.iter().map(|note| note.blocks.len()).sum(),
chunks: self.notes.iter().map(|note| note.chunks.len()).sum(),
wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(),
tags: self.notes.iter().map(|note| note.tags.len()).sum(),
skipped_dirs: self.skipped_dirs.clone(),
@ -236,6 +240,7 @@ impl fmt::Display for IndexSummary {
writeln!(f, "Markdown files: {}", self.markdown_files)?;
writeln!(f, "Headings parsed: {}", self.headings)?;
writeln!(f, "Markdown blocks: {}", self.blocks)?;
writeln!(f, "Chunks: {}", self.chunks)?;
writeln!(f, "Wikilinks: {}", self.wikilinks)?;
writeln!(f, "Tags: {}", self.tags)?;
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?;
@ -248,7 +253,7 @@ impl fmt::Display for IndexSummary {
}
}
fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
fn read_note(path: &Path, vault_path: &Path, config: &Config) -> Result<NoteMetadata> {
let content =
fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?;
let metadata =
@ -256,6 +261,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf();
let source_path = relative_path.to_string_lossy().replace('\\', "/");
let parsed = parse_markdown(&source_path, &content);
let chunks = build_chunks(
&parsed.blocks,
config.index.chunk_target_tokens,
config.index.chunk_overlap_tokens,
);
let content_hash = sha256_hex(&content);
Ok(NoteMetadata {
@ -275,6 +285,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
content_hash,
headings: parsed.headings,
blocks: parsed.blocks,
chunks,
wikilinks: parsed.wikilinks,
tags: parsed.tags,
})