From 15854cc91eafcd73925e35e239b3e6e925207b16 Mon Sep 17 00:00:00 2001
From: "K. Hodges" <khodges42@gmail.com>
Date: Sun, 24 May 2026 03:09:13 -0700
Subject: [PATCH] Add heading chunks and SQLite FTS search

---
 docs/dev/tasks.md |  10 +--
 src/chunk.rs      | 210 ++++++++++++++++++++++++++++++++++++++++++++++
 src/db.rs         | 206 +++++++++++++++++++++++++++++++++++++--------
 src/main.rs       |  43 +++++++---
 src/vault.rs      |  15 +++-
 5 files changed, 431 insertions(+), 53 deletions(-)
 create mode 100644 src/chunk.rs
diff --git a/docs/dev/tasks.md b/docs/dev/tasks.md
index b744ccb..b42be62 100644
--- a/docs/dev/tasks.md
+++ b/docs/dev/tasks.md
@@ -327,7 +327,7 @@ Detect changed notes efficiently.
 
 # Phase 4 — Chunking
 
-## [ ] GM-016 — Implement heading-based chunking
+## [x] GM-016 — Implement heading-based chunking
 
 ### Goals
 Split notes into useful retrieval units.
@@ -344,7 +344,7 @@ Split notes into useful retrieval units.
 
 ---
 
-## [ ] GM-017 — Add fallback chunk splitting
+## [x] GM-017 — Add fallback chunk splitting
 
 ### Goals
 Handle giant sections safely.
@@ -360,7 +360,7 @@ Handle giant sections safely.
 
 ---
 
-## [ ] GM-018 — Estimate token counts
+## [x] GM-018 — Estimate token counts
 
 ### Goals
 Prepare for LLM context budgeting.
@@ -378,7 +378,7 @@ Prepare for LLM context budgeting.
 
 # Phase 5 — Search
 
-## [ ] GM-019 — Implement SQLite FTS search
+## [x] GM-019 — Implement SQLite FTS search
 
 ### Goals
 Add keyword search.
@@ -396,7 +396,7 @@ Add keyword search.
 
 ---
 
-## [ ] GM-020 — Implement basic CLI search command
+## [x] GM-020 — Implement basic CLI search command
 
 ### Goals
 Expose usable search interface.
diff --git a/src/chunk.rs b/src/chunk.rs
new file mode 100644
index 0000000..a87ef8b
--- /dev/null
+++ b/src/chunk.rs
@@ -0,0 +1,210 @@
+use serde::Serialize;
+
+use crate::db::sha256_hex;
+use crate::markdown::{MarkdownBlock, MarkdownBlockKind};
+
+#[derive(Clone, Debug, Serialize)]
+pub struct NoteChunk { // One retrieval unit produced by `build_chunks` from a note's blocks.
+    pub index: usize, // Position in the note's chunk list; assigned last by `build_chunks`.
+    pub heading_path: Vec<String>, // Copied from the last block in the section that has one.
+    pub content: String, // Section text, or one word window of it for split sections.
+    pub chunk_type: ChunkType,
+    pub start_line: usize, // First line of the whole section (shared by all of its splits).
+    pub end_line: usize, // Last line of the whole section (shared by all of its splits).
+    pub token_estimate: usize, // `estimate_tokens(content)`: a whitespace word count.
+    pub content_hash: String, // `sha256_hex(content)`.
+}
+
+#[derive(Clone, Debug, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ChunkType { // How a chunk was cut from its heading section.
+    HeadingSection, // The whole section fit within the target token count.
+    SplitSection, // One overlapping piece of a section that exceeded the target.
+}
+
+/// Groups `blocks` into sections, each starting at a `Heading` block, and chunks every
+/// section with `push_section_chunks`; chunk indexes are then assigned in output order.
+pub fn build_chunks(
+    blocks: &[MarkdownBlock],
+    target_tokens: usize,
+    overlap_tokens: usize,
+) -> Vec<NoteChunk> {
+    let mut chunks = Vec::new();
+    // A section is a contiguous run of `blocks`, so borrow sub-slices rather than clone.
+    let mut start = 0;
+    for (at, block) in blocks.iter().enumerate() {
+        if matches!(block.kind, MarkdownBlockKind::Heading) && at > start {
+            push_section_chunks(&mut chunks, &blocks[start..at], target_tokens, overlap_tokens);
+            start = at;
+        }
+    }
+    if start < blocks.len() {
+        push_section_chunks(&mut chunks, &blocks[start..], target_tokens, overlap_tokens);
+    }
+
+    // Chunks are pushed with a placeholder index of 0; number them now.
+    for (index, chunk) in chunks.iter_mut().enumerate() {
+        chunk.index = index;
+    }
+    chunks
+}
+
+/// Appends the chunks for one section to `chunks`: a single `HeadingSection` chunk when
+/// the text fits in `target_tokens`, otherwise `SplitSection` chunks from
+/// `split_with_overlap`. Chunks get index 0 here and share the section's line range.
+fn push_section_chunks(
+    chunks: &mut Vec<NoteChunk>,
+    section: &[MarkdownBlock],
+    target_tokens: usize,
+    overlap_tokens: usize,
+) {
+    let body = section_text(section);
+    if body.trim().is_empty() {
+        return;
+    }
+
+    // The last block that carries a heading path labels the whole section.
+    let mut heading_path = Vec::new();
+    for candidate in section.iter().rev() {
+        if !candidate.heading_path.is_empty() {
+            heading_path = candidate.heading_path.clone();
+            break;
+        }
+    }
+    let start_line = match section.first() {
+        Some(first) => first.start_line,
+        None => 1,
+    };
+    let end_line = match section.last() {
+        Some(last) => last.end_line,
+        None => start_line,
+    };
+
+    let (pieces, chunk_type) = if estimate_tokens(&body) <= target_tokens {
+        (vec![body], ChunkType::HeadingSection)
+    } else {
+        // Oversized sections are cut into overlapping word windows.
+        (split_with_overlap(&body, target_tokens, overlap_tokens), ChunkType::SplitSection)
+    };
+    for piece in pieces {
+        chunks.push(NoteChunk {
+            index: 0,
+            heading_path: heading_path.clone(),
+            token_estimate: estimate_tokens(&piece),
+            content_hash: sha256_hex(&piece),
+            content: piece,
+            chunk_type: chunk_type.clone(),
+            start_line,
+            end_line,
+        });
+    }
+}
+
+/// Joins the trimmed, non-empty text of every block, separated by blank lines.
+fn section_text(section: &[MarkdownBlock]) -> String {
+    let pieces: Vec<&str> = section
+        .iter()
+        .filter_map(|block| Some(block.text.trim()).filter(|text| !text.is_empty()))
+        .collect();
+    pieces.join("\n\n")
+}
+
+/// Rough token estimate: the whitespace-separated word count, never less than 1.
+pub fn estimate_tokens(content: &str) -> usize {
+    content.split_whitespace().count().max(1)
+}
+
+/// Cuts `content` into windows of at most `target_tokens` words, overlapping by up to
+/// `overlap_tokens`. Each window may be shortened by `trim_to_sentenceish_boundary`; the
+/// next window resumes from the words actually kept, so trimmed-off words are carried over.
+fn split_with_overlap(content: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<String> {
+    let words: Vec<_> = content.split_whitespace().collect();
+    let window = target_tokens.max(1); // a zero target would only yield empty parts
+    let mut parts = Vec::new();
+    let mut start = 0;
+    while start < words.len() {
+        let window_words = &words[start..start.saturating_add(window).min(words.len())];
+        let part = trim_to_sentenceish_boundary(window_words.join(" "));
+        // Leading words kept whole; a word cut in half is left for the next window.
+        let kept = part.split_whitespace().zip(window_words).take_while(|(a, b)| a == *b).count();
+        parts.push(part);
+        if start + kept >= words.len() {
+            break;
+        }
+        // Step back by the overlap, but always advance (even if no whole word was kept).
+        start += kept.saturating_sub(overlap_tokens).max(1);
+    }
+    parts
+}
+
+/// Cuts a long (>= 240 byte) unterminated part back to its last `.`/`!`/`?` in the second half.
+fn trim_to_sentenceish_boundary(mut part: String) -> String {
+    if part.len() < 240 || part.ends_with(['.', '!', '?']) {
+        return part;
+    }
+    if let Some(cut) = part.rfind(['.', '!', '?']).filter(|idx| *idx > part.len() / 2) {
+        part.truncate(cut + 1);
+    }
+    part
+}
+
+pub fn chunk_type_name(kind: &ChunkType) -> &'static str { // Name stored for each chunk.
+    match kind {
+        ChunkType::HeadingSection => "heading_section", // same spelling as the serde name
+        ChunkType::SplitSection => "split_section",
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::markdown::{MarkdownBlock, MarkdownBlockKind};
+
+    use super::{ChunkType, build_chunks};
+
+    #[test]
+    fn builds_heading_chunks_in_order() { // Two small heading sections give two chunks.
+        let blocks = vec![
+            block(MarkdownBlockKind::Heading, "A", 1, vec!["A"]),
+            block(MarkdownBlockKind::Paragraph, "one", 2, vec!["A"]),
+            block(MarkdownBlockKind::Heading, "B", 3, vec!["B"]),
+            block(MarkdownBlockKind::Paragraph, "two", 4, vec!["B"]),
+        ];
+
+        let chunks = build_chunks(&blocks, 100, 10); // target 100 words: nothing is split
+
+        assert_eq!(chunks.len(), 2);
+        assert_eq!(chunks[0].heading_path, vec!["A"]);
+        assert_eq!(chunks[1].heading_path, vec!["B"]);
+        assert!(matches!(chunks[0].chunk_type, ChunkType::HeadingSection));
+    }
+
+    #[test]
+    fn splits_large_sections_with_overlap() { // One 30-word section against a 10-word target.
+        let text = (0..30)
+            .map(|idx| format!("word{idx}"))
+            .collect::<Vec<_>>()
+            .join(" ");
+        let blocks = vec![block(MarkdownBlockKind::Paragraph, &text, 1, vec![])];
+
+        let chunks = build_chunks(&blocks, 10, 2);
+
+        assert!(chunks.len() > 1);
+        assert!(chunks.iter().all(|chunk| chunk.token_estimate <= 10));
+        assert!(matches!(chunks[1].chunk_type, ChunkType::SplitSection));
+    }
+
+    fn block( // Builds a single-line block (start_line == end_line) for the tests above.
+        kind: MarkdownBlockKind,
+        text: &str,
+        line: usize,
+        heading_path: Vec<&str>,
+    ) -> MarkdownBlock {
+        MarkdownBlock {
+            kind,
+            text: text.to_string(),
+            start_line: line,
+            end_line: line,
+            heading_path: heading_path.into_iter().map(String::from).collect(),
+        }
+    }
+}
diff --git a/src/db.rs b/src/db.rs
index bed5747..43b32ed 100644
--- a/src/db.rs
+++ b/src/db.rs
@@ -6,13 +6,25 @@ use rusqlite::{Connection, OptionalExtension, params};
 use sha2::{Digest, Sha256};
 use tracing::debug;
 
-use crate::markdown::MarkdownBlockKind;
+use crate::chunk::chunk_type_name;
 use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
 
+const INDEX_VERSION: i64 = 2; // Notes stored with any other version are re-indexed.
+
 pub struct IndexStore {
     conn: Connection,
 }
 
+#[derive(Clone, Debug, serde::Serialize)]
+pub struct SearchHit { // One row returned by `IndexStore::search`.
+    pub path: String, // `notes.path` of the note containing the matching chunk.
+    pub title: String,
+    pub heading_path: String, // Stored heading path of the chunk (joined with " > " on insert).
+    pub snippet: String, // FTS5 `snippet()` of the chunk content; matches wrapped in `[` `]`.
+    pub score: f64, // Negated `bm25()` value, so larger means a better match.
+    pub token_estimate: usize,
+}
+
 impl IndexStore {
     pub fn open(path: &Path) -> Result<Self> {
         if let Some(parent) = path.parent() {
@@ -34,8 +46,8 @@ impl IndexStore {
         // This is a rebuildable cache, so changed notes get their child rows replaced in place.
         for note in &index.notes {
             summary.notes_seen += 1;
-            let existing_hash = existing_note_hash(&tx, &note.path)?;
-            if existing_hash.as_deref() == Some(note.content_hash.as_str()) {
+            let fresh = existing_note_fresh(&tx, &note.path, &note.content_hash)?;
+            if fresh {
                 summary.unchanged_notes += 1;
                 debug!(path = %note.path.display(), "skipping unchanged note");
                 continue;
@@ -49,10 +61,51 @@ impl IndexStore {
             insert_links(&tx, note_id, note, &mut summary)?;
         }
 
+        rebuild_fts_if_empty(&tx)?;
         tx.commit()?;
         Ok(summary)
     }
 
+    /// Full-text searches chunk content for `query`, returning at most `limit` hits.
+    pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
+        let match_expr = fts_query(query);
+        if match_expr.is_empty() {
+            return Ok(Vec::new());
+        }
+        let mut stmt = self.conn.prepare(
+            r#"
+            SELECT
+                notes.path,
+                notes.title,
+                chunks.heading_path,
+                snippet(chunks_fts, 0, '[', ']', '...', 18) AS snippet,
+                bm25(chunks_fts) AS score,
+                chunks.token_estimate
+            FROM chunks_fts
+            JOIN chunks ON chunks.id = chunks_fts.rowid
+            JOIN notes ON notes.id = chunks.note_id
+            WHERE chunks_fts MATCH ?1
+            ORDER BY score
+            LIMIT ?2
+            "#,
+        )?;
+        let rows = stmt.query_map(params![match_expr, limit as i64], |row| {
+            Ok(SearchHit {
+                path: row.get(0)?,
+                title: row.get(1)?,
+                heading_path: row.get(2)?,
+                snippet: row.get(3)?,
+                score: -row.get::<_, f64>(4)?, // bm25() is lower-is-better; flip the sign
+                token_estimate: row.get::<_, i64>(5)? as usize,
+            })
+        })?;
+        let mut hits = Vec::new();
+        for row in rows {
+            hits.push(row?);
+        }
+        Ok(hits)
+    }
+
     fn bootstrap(&self) -> Result<()> {
         self.conn.execute_batch(
             r#"
@@ -72,6 +125,7 @@ impl IndexStore {
                 modified_unix_secs INTEGER,
                 file_size INTEGER NOT NULL,
                 content_hash TEXT NOT NULL,
+                index_version INTEGER NOT NULL DEFAULT 2,
                 created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
                 updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
             );
@@ -91,6 +145,14 @@ impl IndexStore {
                 UNIQUE(note_id, chunk_index)
             );
 
+            CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts
+            USING fts5(
+                content,
+                path UNINDEXED,
+                title UNINDEXED,
+                heading_path UNINDEXED
+            );
+
             CREATE TABLE IF NOT EXISTS tags (
                 id INTEGER PRIMARY KEY,
                 name TEXT NOT NULL UNIQUE
@@ -116,18 +178,39 @@ impl IndexStore {
             INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
             "#,
         )?;
+        ensure_notes_index_version(&self.conn)?;
         Ok(())
     }
 }
 
-fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> {
-    conn.query_row(
-        "SELECT content_hash FROM notes WHERE path = ?1",
-        [path_to_db(path)],
-        |row| row.get(0),
-    )
-    .optional()
-    .context("failed to read existing note hash")
+/// True when the row stored for `path` has this `content_hash` and the current `INDEX_VERSION`.
+fn existing_note_fresh(conn: &Connection, path: &Path, content_hash: &str) -> Result<bool> {
+    let stored = conn
+        .query_row(
+            "SELECT content_hash, index_version FROM notes WHERE path = ?1",
+            [path_to_db(path)],
+            |row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)),
+        )
+        .optional()
+        .context("failed to read existing note freshness")?;
+    Ok(matches!(stored, Some((hash, version)) if hash == content_hash && version == INDEX_VERSION))
+}
+
+/// Adds the `notes.index_version` column to databases whose `notes` table lacks it.
+/// The column list comes from `PRAGMA table_info(notes)`; nothing happens when the column
+/// is already present.
+fn ensure_notes_index_version(conn: &Connection) -> Result<()> {
+    let mut stmt = conn.prepare("PRAGMA table_info(notes)")?;
+    let names = stmt
+        .query_map([], |row| row.get::<_, String>(1))?
+        .collect::<rusqlite::Result<Vec<_>>>()?;
+    if names.iter().any(|name| name == "index_version") {
+        return Ok(());
+    }
+    // Migrated rows default to 1, which differs from INDEX_VERSION and forces a re-index.
+    conn.execute("ALTER TABLE notes ADD COLUMN index_version INTEGER NOT NULL DEFAULT 1", [])
+        .context("failed to add notes.index_version")?;
+    Ok(())
+}
 
 fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
@@ -140,15 +223,17 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
             modified_unix_secs,
             file_size,
             content_hash,
+            index_version,
             updated_at
         )
-        VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP)
+        VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, CURRENT_TIMESTAMP)
         ON CONFLICT(path) DO UPDATE SET
             filename = excluded.filename,
             title = excluded.title,
             modified_unix_secs = excluded.modified_unix_secs,
             file_size = excluded.file_size,
             content_hash = excluded.content_hash,
+            index_version = excluded.index_version,
             updated_at = CURRENT_TIMESTAMP
         "#,
         params![
@@ -158,6 +243,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
             note.modified_unix_secs,
             note.file_size,
             note.content_hash,
+            INDEX_VERSION,
         ],
     )?;
 
@@ -170,6 +256,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
 }
 
 fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
+    delete_note_fts(conn, note_id)?;
     conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?;
     conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?;
     conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?;
@@ -182,8 +269,8 @@ fn insert_chunks(
     note: &NoteMetadata,
     summary: &mut IndexWriteSummary,
 ) -> Result<()> {
-    for (idx, block) in note.blocks.iter().enumerate() {
-        if block.text.trim().is_empty() {
+    for chunk in &note.chunks {
+        if chunk.content.trim().is_empty() {
             continue;
         }
 
@@ -204,21 +291,73 @@ fn insert_chunks(
             "#,
             params![
                 note_id,
-                idx as i64,
-                block.heading_path.join(" > "),
-                block.text,
-                chunk_type(&block.kind),
-                block.start_line as i64,
-                block.end_line as i64,
-                estimate_tokens(&block.text) as i64,
-                sha256_hex(&block.text),
+                chunk.index as i64,
+                chunk.heading_path.join(" > "),
+                chunk.content,
+                chunk_type_name(&chunk.chunk_type),
+                chunk.start_line as i64,
+                chunk.end_line as i64,
+                chunk.token_estimate as i64,
+                chunk.content_hash,
             ],
         )?;
+        let chunk_id = conn.last_insert_rowid();
+        insert_chunk_fts(conn, chunk_id, note, chunk)?;
         summary.chunks_written += 1;
     }
     Ok(())
 }
 
+/// Mirrors one stored chunk into the `chunks_fts` table.
+///
+/// The FTS row reuses `chunk_id` as its rowid, which is what lets `search` join
+/// `chunks_fts` back to `chunks`.
+fn insert_chunk_fts(
+    conn: &Connection,
+    chunk_id: i64,
+    note: &NoteMetadata,
+    chunk: &crate::chunk::NoteChunk,
+) -> Result<()> {
+    let note_path = path_to_db(&note.path);
+    let heading = chunk.heading_path.join(" > ");
+    conn.execute(
+        "INSERT INTO chunks_fts (rowid, content, path, title, heading_path) VALUES (?1, ?2, ?3, ?4, ?5)",
+        params![chunk_id, chunk.content, note_path, note.title, heading],
+    )?;
+    Ok(())
+}
+
+/// Removes the FTS rows that mirror `note_id`'s chunks, in a single statement.
+///
+/// FTS rows are keyed by chunk id (see `insert_chunk_fts`), so this must run while the
+/// note's rows are still present in `chunks`.
+fn delete_note_fts(conn: &Connection, note_id: i64) -> Result<()> {
+    conn.execute(
+        "DELETE FROM chunks_fts WHERE rowid IN (SELECT id FROM chunks WHERE note_id = ?1)",
+        [note_id],
+    )?;
+    Ok(())
+}
+
+/// Copies every row of `chunks` (joined with its note) into `chunks_fts`, but only when
+/// the FTS table is empty and `chunks` is not.
+fn rebuild_fts_if_empty(conn: &Connection) -> Result<()> {
+    let fts_rows: i64 = conn.query_row("SELECT count(*) FROM chunks_fts", [], |row| row.get(0))?;
+    let chunk_rows: i64 = conn.query_row("SELECT count(*) FROM chunks", [], |row| row.get(0))?;
+    if fts_rows == 0 && chunk_rows != 0 {
+        conn.execute(
+            r#"
+        INSERT INTO chunks_fts (rowid, content, path, title, heading_path)
+        SELECT chunks.id, chunks.content, notes.path, notes.title, chunks.heading_path
+        FROM chunks
+        JOIN notes ON notes.id = chunks.note_id
+        "#,
+            [],
+        )?;
+    }
+    Ok(())
+}
+
 fn insert_tags(
     conn: &Connection,
     note_id: i64,
@@ -259,19 +398,16 @@ pub fn sha256_hex(content: &str) -> String {
     format!("{:x}", Sha256::digest(content.as_bytes()))
 }
 
-fn estimate_tokens(content: &str) -> usize {
-    content.split_whitespace().count().max(1)
-}
-
-fn chunk_type(kind: &MarkdownBlockKind) -> &'static str {
-    match kind {
-        MarkdownBlockKind::Heading => "heading",
-        MarkdownBlockKind::Paragraph => "paragraph",
-        MarkdownBlockKind::CodeBlock => "code_block",
-        MarkdownBlockKind::List => "list",
-    }
-}
-
 fn path_to_db(path: &Path) -> String {
     PathBuf::from(path).to_string_lossy().replace('\\', "/")
 }
+
+/// Turns free text into an FTS5 query: terms are stripped of surrounding punctuation
+/// (keeping `_` and `-`), double-quoted with embedded quotes doubled, and space-joined.
+fn fts_query(query: &str) -> String {
+    let is_noise = |c: char| !(c.is_alphanumeric() || c == '_' || c == '-');
+    let terms = query.split_whitespace().map(|raw| raw.trim_matches(is_noise));
+    let kept = terms.filter(|term| !term.is_empty());
+    let quoted: Vec<_> = kept.map(|term| format!("\"{}\"", term.replace('"', "\"\""))).collect();
+    quoted.join(" ")
+}
diff --git a/src/main.rs b/src/main.rs
index cccf282..c2944f4 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,4 @@
+mod chunk;
 mod cli;
 mod config;
 mod db;
@@ -11,7 +12,7 @@ use tracing::{debug, info};
 
 use crate::cli::{Cli, Commands, OutputFormat};
 use crate::config::Config;
-use crate::db::IndexStore;
+use crate::db::{IndexStore, SearchHit};
 use crate::vault::VaultIndex;
 
 fn main() -> Result<()> {
@@ -54,21 +55,15 @@ fn main() -> Result<()> {
             limit,
             output,
         } => {
-            let index = VaultIndex::scan(&config)?;
-            let results = index.search(&query, limit);
+            let db_path = ensure_index_cache(&config)?;
+            let store = IndexStore::open(&db_path)?;
+            let results = store.search(&query, limit)?;
             match output {
                 OutputFormat::Text => {
                     if results.is_empty() {
                         println!("No matches.");
                     }
-                    for (position, result) in results.iter().enumerate() {
-                        println!("{}. {}", position + 1, result.note.path.display());
-                        println!("   title: {}", result.note.title);
-                        if !result.note.headings.is_empty() {
-                            println!("   headings: {}", result.note.headings.join(" > "));
-                        }
-                        println!("   score: {}", result.score);
-                    }
+                    print_search_results(&results);
                 }
                 OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&results)?),
             }
@@ -105,3 +100,29 @@ fn init_project(config: &Config, force: bool) -> Result<()> {
     println!("Config: {}", Config::default_path().display());
     Ok(())
 }
+
+/// Returns the index database path, building the index first if the file does not exist.
+/// An existing database is returned as-is; it is not refreshed against the vault here.
+fn ensure_index_cache(config: &Config) -> Result<std::path::PathBuf> {
+    let db_path = config.vault.path.join(&config.database.path);
+    if !db_path.exists() {
+        let index = VaultIndex::scan(config)?;
+        config.create_agent_dirs()?;
+        let mut store = IndexStore::open(&db_path)?;
+        store.write_index(&index)?;
+    }
+    Ok(db_path)
+}
+
+fn print_search_results(results: &[SearchHit]) { // Text-mode listing of search hits on stdout.
+    for (position, result) in results.iter().enumerate() {
+        println!("{}. {}", position + 1, result.path); // ranks are displayed 1-based
+        println!("   title: {}", result.title);
+        if !result.heading_path.is_empty() { // hits with an empty heading path omit this line
+            println!("   heading: {}", result.heading_path);
+        }
+        println!("   tokens: {}", result.token_estimate);
+        println!("   score: {:.4}", result.score);
+        println!("   {}", result.snippet);
+    }
+}
diff --git a/src/vault.rs b/src/vault.rs
index 63feeb2..c508a7f 100644
--- a/src/vault.rs
+++ b/src/vault.rs
@@ -8,6 +8,7 @@ use serde::Serialize;
 use tracing::{debug, warn};
 use walkdir::{DirEntry, WalkDir};
 
+use crate::chunk::{NoteChunk, build_chunks};
 use crate::config::Config;
 use crate::db::sha256_hex;
 use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown};
@@ -30,6 +31,7 @@ pub struct NoteMetadata {
     pub content_hash: String,
     pub headings: Vec<String>,
     pub blocks: Vec<MarkdownBlock>,
+    pub chunks: Vec<NoteChunk>,
     pub wikilinks: Vec<Wikilink>,
     pub tags: Vec<String>,
 }
@@ -41,6 +43,7 @@ pub struct IndexSummary {
     pub markdown_files: usize,
     pub headings: usize,
     pub blocks: usize,
+    pub chunks: usize,
     pub wikilinks: usize,
     pub tags: usize,
     pub skipped_dirs: Vec<PathBuf>,
@@ -99,7 +102,7 @@ impl VaultIndex {
                 continue;
             }
 
-            let note = read_note(entry.path(), &config.vault.path)?;
+            let note = read_note(entry.path(), &config.vault.path, config)?;
             debug!(
                 path = %note.path.display(),
                 title = %note.title,
@@ -129,6 +132,7 @@ impl VaultIndex {
             markdown_files: self.markdown_count,
             headings: self.notes.iter().map(|note| note.headings.len()).sum(),
             blocks: self.notes.iter().map(|note| note.blocks.len()).sum(),
+            chunks: self.notes.iter().map(|note| note.chunks.len()).sum(),
             wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(),
             tags: self.notes.iter().map(|note| note.tags.len()).sum(),
             skipped_dirs: self.skipped_dirs.clone(),
@@ -236,6 +240,7 @@ impl fmt::Display for IndexSummary {
         writeln!(f, "Markdown files: {}", self.markdown_files)?;
         writeln!(f, "Headings parsed: {}", self.headings)?;
         writeln!(f, "Markdown blocks: {}", self.blocks)?;
+        writeln!(f, "Chunks: {}", self.chunks)?;
         writeln!(f, "Wikilinks: {}", self.wikilinks)?;
         writeln!(f, "Tags: {}", self.tags)?;
         writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?;
@@ -248,7 +253,7 @@ impl fmt::Display for IndexSummary {
     }
 }
 
-fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
+fn read_note(path: &Path, vault_path: &Path, config: &Config) -> Result<NoteMetadata> {
     let content =
         fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?;
     let metadata =
@@ -256,6 +261,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
     let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf();
     let source_path = relative_path.to_string_lossy().replace('\\', "/");
     let parsed = parse_markdown(&source_path, &content);
+    let chunks = build_chunks(
+        &parsed.blocks,
+        config.index.chunk_target_tokens,
+        config.index.chunk_overlap_tokens,
+    );
     let content_hash = sha256_hex(&content);
 
     Ok(NoteMetadata {
@@ -275,6 +285,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
         content_hash,
         headings: parsed.headings,
         blocks: parsed.blocks,
+        chunks,
         wikilinks: parsed.wikilinks,
         tags: parsed.tags,
     })