From 9fb82b532477b1313572b3fffdd4081930b4e18b Mon Sep 17 00:00:00 2001 From: "K. Hodges" Date: Sun, 24 May 2026 02:47:19 -0700 Subject: [PATCH] next chunk of the boring but important indexing layer --- Cargo.lock | 176 ++++++++++++++++++++++++++++- Cargo.toml | 2 + docs/dev/tasks.md | 10 +- glassmind.toml | 3 + src/config.rs | 12 ++ src/db.rs | 277 ++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 12 +- src/markdown.rs | 125 ++++++++++++++++++++- src/vault.rs | 36 +++++- 9 files changed, 639 insertions(+), 14 deletions(-) create mode 100644 src/db.rs diff --git a/Cargo.lock b/Cargo.lock index 55598d6..486cb46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -73,6 +73,25 @@ version = "2.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "cc" +version = "1.2.62" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" @@ -125,12 +144,75 @@ version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + [[package]] name = "equivalent" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "fallible-iterator" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649" + +[[package]] +name = "fallible-streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + [[package]] name = "getopts" version = "0.2.24" @@ -148,20 +230,40 @@ dependencies = [ "clap", "pulldown-cmark", "regex", + "rusqlite", "serde", "serde_json", + "sha2", "toml", "tracing", "tracing-subscriber", "walkdir", ] +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash", +] + [[package]] name = "hashbrown" version = "0.17.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" +[[package]] +name = "hashlink" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" +dependencies = [ + "hashbrown 0.15.5", +] + [[package]] name = "heck" version = "0.5.0" @@ -175,7 +277,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.17.1", ] [[package]] @@ -196,6 +298,23 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "libsqlite3-sys" +version = "0.35.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "133c182a6a2c87864fe97778797e46c7e999672690dc9fa3ee8e241aa4a9c13f" +dependencies = [ + "cc", + "pkg-config", + "vcpkg", +] + [[package]] name = "log" version = "0.4.29" @@ -244,6 +363,12 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pkg-config" +version = "0.3.33" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e" + [[package]] name = "proc-macro2" version = "1.0.106" @@ -310,6 +435,20 @@ version = "0.8.10" 
source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "rusqlite" +version = "0.37.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "165ca6e57b20e1351573e3729b958bc62f0e48025386970b6e4d29e7a7e71f3f" +dependencies = [ + "bitflags", + "fallible-iterator", + "fallible-streaming-iterator", + "hashlink", + "libsqlite3-sys", + "smallvec", +] + [[package]] name = "same-file" version = "1.0.6" @@ -371,6 +510,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + [[package]] name = "sharded-slab" version = "0.1.7" @@ -380,6 +530,12 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "smallvec" version = "1.15.1" @@ -512,6 +668,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "typenum" +version = "1.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de" + [[package]] name = "unicase" version = "2.9.0" @@ -542,6 +704,18 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" diff --git a/Cargo.toml b/Cargo.toml index 06cb75a..fe369d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,8 +8,10 @@ anyhow = "1.0" clap = { version = "4.5", features = ["derive"] } pulldown-cmark = "0.13" regex = "1.11" +rusqlite = { version = "0.37", features = ["bundled"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" +sha2 = "0.10" toml = "0.9" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } diff --git a/docs/dev/tasks.md b/docs/dev/tasks.md index eb1c17c..b744ccb 100644 --- a/docs/dev/tasks.md +++ b/docs/dev/tasks.md @@ -232,7 +232,7 @@ Store: --- -## [ ] GM-011 — Extract tags +## [x] GM-011 — Extract tags ### Goals Parse tags from notes. @@ -254,7 +254,7 @@ Normalize: # Phase 3 — Database Layer -## [ ] GM-012 — Add SQLite integration +## [x] GM-012 — Add SQLite integration ### Goals Create local metadata database. @@ -271,7 +271,7 @@ Create local metadata database. --- -## [ ] GM-013 — Create notes table +## [x] GM-013 — Create notes table ### Goals Store note metadata. @@ -289,7 +289,7 @@ Create schema for: --- -## [ ] GM-014 — Create chunks table +## [x] GM-014 — Create chunks table ### Goals Store retrieval chunks. @@ -308,7 +308,7 @@ Store: --- -## [ ] GM-015 — Add content hashing +## [x] GM-015 — Add content hashing ### Goals Detect changed notes efficiently. diff --git a/glassmind.toml b/glassmind.toml index fc1ca40..df59c2c 100644 --- a/glassmind.toml +++ b/glassmind.toml @@ -1,6 +1,9 @@ [vault] path = "." 
+[database] +path = ".agent/cache/glassmind.sqlite3" + [index] include_agent_dir = true ignore_dirs = [ diff --git a/src/config.rs b/src/config.rs index ec8d3e3..17d3e7a 100644 --- a/src/config.rs +++ b/src/config.rs @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; #[derive(Clone, Debug, Deserialize, Serialize)] pub struct Config { pub vault: VaultConfig, + pub database: DatabaseConfig, pub index: IndexConfig, pub embeddings: EmbeddingsConfig, pub search: SearchConfig, @@ -19,6 +20,11 @@ pub struct VaultConfig { pub path: PathBuf, } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct DatabaseConfig { + pub path: PathBuf, +} + #[derive(Clone, Debug, Deserialize, Serialize)] pub struct IndexConfig { pub include_agent_dir: bool, @@ -93,6 +99,9 @@ impl Config { if self.server.port == 0 { bail!("server.port must be greater than zero"); } + if self.database.path.as_os_str().is_empty() { + bail!("database.path must not be empty"); + } match self.writes.mode.as_str() { "off" | "agent-only" | "propose" | "allow" => {} other => { @@ -138,6 +147,9 @@ impl Default for Config { vault: VaultConfig { path: PathBuf::from("."), }, + database: DatabaseConfig { + path: PathBuf::from(".agent/cache/glassmind.sqlite3"), + }, index: IndexConfig { include_agent_dir: true, ignore_dirs: vec![ diff --git a/src/db.rs b/src/db.rs new file mode 100644 index 0000000..bed5747 --- /dev/null +++ b/src/db.rs @@ -0,0 +1,277 @@ +use std::fs; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result}; +use rusqlite::{Connection, OptionalExtension, params}; +use sha2::{Digest, Sha256}; +use tracing::debug; + +use crate::markdown::MarkdownBlockKind; +use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex}; + +pub struct IndexStore { + conn: Connection, +} + +impl IndexStore { + pub fn open(path: &Path) -> Result<Self> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent) + .with_context(|| format!("failed to create db dir {}", parent.display()))?; + } + + let conn =
Connection::open(path) + .with_context(|| format!("failed to open sqlite db {}", path.display()))?; + let store = Self { conn }; + store.bootstrap()?; + Ok(store) + } + + pub fn write_index(&mut self, index: &VaultIndex) -> Result<IndexWriteSummary> { + let tx = self.conn.transaction()?; + let mut summary = IndexWriteSummary::default(); + + // This is a rebuildable cache, so changed notes get their child rows replaced in place. + for note in &index.notes { + summary.notes_seen += 1; + let existing_hash = existing_note_hash(&tx, &note.path)?; + if existing_hash.as_deref() == Some(note.content_hash.as_str()) { + summary.unchanged_notes += 1; + debug!(path = %note.path.display(), "skipping unchanged note"); + continue; + } + + summary.changed_notes += 1; + let note_id = upsert_note(&tx, note)?; + clear_note_children(&tx, note_id)?; + insert_chunks(&tx, note_id, note, &mut summary)?; + insert_tags(&tx, note_id, note, &mut summary)?; + insert_links(&tx, note_id, note, &mut summary)?; + } + + tx.commit()?; + Ok(summary) + } + + fn bootstrap(&self) -> Result<()> { + self.conn.execute_batch( + r#" + PRAGMA foreign_keys = ON; + + CREATE TABLE IF NOT EXISTS migrations ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE, + applied_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS notes ( + id INTEGER PRIMARY KEY, + path TEXT NOT NULL UNIQUE, + filename TEXT NOT NULL, + title TEXT NOT NULL, + modified_unix_secs INTEGER, + file_size INTEGER NOT NULL, + content_hash TEXT NOT NULL, + created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP + ); + + CREATE TABLE IF NOT EXISTS chunks ( + id INTEGER PRIMARY KEY, + note_id INTEGER NOT NULL, + chunk_index INTEGER NOT NULL, + heading_path TEXT NOT NULL, + content TEXT NOT NULL, + chunk_type TEXT NOT NULL, + start_line INTEGER NOT NULL, + end_line INTEGER NOT NULL, + token_estimate INTEGER NOT NULL, + content_hash TEXT NOT NULL, + FOREIGN KEY(note_id) REFERENCES notes(id) ON
DELETE CASCADE, + UNIQUE(note_id, chunk_index) + ); + + CREATE TABLE IF NOT EXISTS tags ( + id INTEGER PRIMARY KEY, + name TEXT NOT NULL UNIQUE + ); + + CREATE TABLE IF NOT EXISTS note_tags ( + note_id INTEGER NOT NULL, + tag_id INTEGER NOT NULL, + FOREIGN KEY(note_id) REFERENCES notes(id) ON DELETE CASCADE, + FOREIGN KEY(tag_id) REFERENCES tags(id) ON DELETE CASCADE, + PRIMARY KEY(note_id, tag_id) + ); + + CREATE TABLE IF NOT EXISTS links ( + id INTEGER PRIMARY KEY, + source_note_id INTEGER NOT NULL, + target TEXT NOT NULL, + alias TEXT, + link_type TEXT NOT NULL DEFAULT 'wikilink', + FOREIGN KEY(source_note_id) REFERENCES notes(id) ON DELETE CASCADE + ); + + INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index'); + "#, + )?; + Ok(()) + } +} + +fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> { + conn.query_row( + "SELECT content_hash FROM notes WHERE path = ?1", + [path_to_db(path)], + |row| row.get(0), + ) + .optional() + .context("failed to read existing note hash") +} + +fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> { + conn.execute( + r#" + INSERT INTO notes ( + path, + filename, + title, + modified_unix_secs, + file_size, + content_hash, + updated_at + ) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP) + ON CONFLICT(path) DO UPDATE SET + filename = excluded.filename, + title = excluded.title, + modified_unix_secs = excluded.modified_unix_secs, + file_size = excluded.file_size, + content_hash = excluded.content_hash, + updated_at = CURRENT_TIMESTAMP + "#, + params![ + path_to_db(&note.path), + note.filename, + note.title, + note.modified_unix_secs, + note.file_size, + note.content_hash, + ], + )?; + + conn.query_row( + "SELECT id FROM notes WHERE path = ?1", + [path_to_db(&note.path)], + |row| row.get(0), + ) + .context("failed to read upserted note id") +} + +fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> { + conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?; +
conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?; + conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?; + Ok(()) +} + +fn insert_chunks( + conn: &Connection, + note_id: i64, + note: &NoteMetadata, + summary: &mut IndexWriteSummary, +) -> Result<()> { + for (idx, block) in note.blocks.iter().enumerate() { + if block.text.trim().is_empty() { + continue; + } + + conn.execute( + r#" + INSERT INTO chunks ( + note_id, + chunk_index, + heading_path, + content, + chunk_type, + start_line, + end_line, + token_estimate, + content_hash + ) + VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9) + "#, + params![ + note_id, + idx as i64, + block.heading_path.join(" > "), + block.text, + chunk_type(&block.kind), + block.start_line as i64, + block.end_line as i64, + estimate_tokens(&block.text) as i64, + sha256_hex(&block.text), + ], + )?; + summary.chunks_written += 1; + } + Ok(()) +} + +fn insert_tags( + conn: &Connection, + note_id: i64, + note: &NoteMetadata, + summary: &mut IndexWriteSummary, +) -> Result<()> { + for tag in &note.tags { + conn.execute("INSERT OR IGNORE INTO tags (name) VALUES (?1)", [tag])?; + let tag_id: i64 = conn.query_row("SELECT id FROM tags WHERE name = ?1", [tag], |row| { + row.get(0) + })?; + conn.execute( + "INSERT OR IGNORE INTO note_tags (note_id, tag_id) VALUES (?1, ?2)", + params![note_id, tag_id], + )?; + summary.tags_seen += 1; + } + Ok(()) +} + +fn insert_links( + conn: &Connection, + note_id: i64, + note: &NoteMetadata, + summary: &mut IndexWriteSummary, +) -> Result<()> { + for link in &note.wikilinks { + conn.execute( + "INSERT INTO links (source_note_id, target, alias, link_type) VALUES (?1, ?2, ?3, 'wikilink')", + params![note_id, link.target, link.alias], + )?; + summary.links_written += 1; + } + Ok(()) +} + +pub fn sha256_hex(content: &str) -> String { + format!("{:x}", Sha256::digest(content.as_bytes())) +} + +fn estimate_tokens(content: &str) -> usize { + content.split_whitespace().count().max(1) +} + +fn
chunk_type(kind: &MarkdownBlockKind) -> &'static str { + match kind { + MarkdownBlockKind::Heading => "heading", + MarkdownBlockKind::Paragraph => "paragraph", + MarkdownBlockKind::CodeBlock => "code_block", + MarkdownBlockKind::List => "list", + } +} + +fn path_to_db(path: &Path) -> String { + PathBuf::from(path).to_string_lossy().replace('\\', "/") +} diff --git a/src/main.rs b/src/main.rs index e31a895..cccf282 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,6 @@ mod cli; mod config; +mod db; mod logging; mod markdown; mod vault; @@ -10,6 +11,7 @@ use tracing::{debug, info}; use crate::cli::{Cli, Commands, OutputFormat}; use crate::config::Config; +use crate::db::IndexStore; use crate::vault::VaultIndex; fn main() -> Result<()> { @@ -25,10 +27,16 @@ fn main() -> Result<()> { Commands::Init { force } => init_project(&config, force), Commands::Index { json } => { let index = VaultIndex::scan(&config)?; + config.create_agent_dirs()?; + // Indexing writes the rebuildable cache, while search can still scan live markdown. 
+ let db_path = config.vault.path.join(&config.database.path); + let mut store = IndexStore::open(&db_path)?; + let writes = store.write_index(&index)?; + let summary = index.summary_with_writes(writes); if json { - println!("{}", serde_json::to_string_pretty(&index.summary())?); + println!("{}", serde_json::to_string_pretty(&summary)?); } else { - println!("{}", index.summary()); + println!("{summary}"); } Ok(()) } diff --git a/src/markdown.rs b/src/markdown.rs index 9e0cd9f..922a17c 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeSet; + use regex::Regex; use serde::Serialize; @@ -6,6 +8,7 @@ pub struct MarkdownDocument { pub headings: Vec, pub blocks: Vec, pub wikilinks: Vec, + pub tags: Vec, } #[derive(Clone, Debug, Serialize)] @@ -14,6 +17,7 @@ pub struct MarkdownBlock { pub text: String, pub start_line: usize, pub end_line: usize, + pub heading_path: Vec, } #[derive(Clone, Debug, Serialize)] @@ -42,11 +46,13 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { let mut in_code = false; let mut code = Vec::new(); let mut code_start = 0; + let mut heading_stack: Vec<(usize, String)> = Vec::new(); for (idx, line) in content.lines().enumerate() { let line_no = idx + 1; let trimmed = line.trim(); + // Code fences get kept whole so later chunks stay readable. 
if trimmed.starts_with("```") || trimmed.starts_with("~~~") { if in_code { code.push(line.to_string()); @@ -55,6 +61,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { text: code.join("\n"), start_line: code_start, end_line: line_no, + heading_path: current_heading_path(&heading_stack), }); code.clear(); in_code = false; @@ -64,6 +71,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { &mut paragraph, paragraph_start, line_no.saturating_sub(1), + &heading_stack, ); in_code = true; code_start = line_no; @@ -77,19 +85,28 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { continue; } - if let Some(heading) = parse_heading(trimmed) { + if let Some((level, heading)) = parse_heading(trimmed) { flush_paragraph( &mut blocks, &mut paragraph, paragraph_start, line_no.saturating_sub(1), + &heading_stack, ); + while heading_stack + .last() + .is_some_and(|(last_level, _)| *last_level >= level) + { + heading_stack.pop(); + } + heading_stack.push((level, heading.clone())); headings.push(heading.clone()); blocks.push(MarkdownBlock { kind: MarkdownBlockKind::Heading, text: heading, start_line: line_no, end_line: line_no, + heading_path: current_heading_path(&heading_stack), }); continue; } @@ -100,12 +117,14 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { &mut paragraph, paragraph_start, line_no.saturating_sub(1), + &heading_stack, ); blocks.push(MarkdownBlock { kind: MarkdownBlockKind::List, text: trimmed.to_string(), start_line: line_no, end_line: line_no, + heading_path: current_heading_path(&heading_stack), }); continue; } @@ -116,6 +135,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { &mut paragraph, paragraph_start, line_no.saturating_sub(1), + &heading_stack, ); continue; } @@ -133,14 +153,22 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument { text: code.join("\n"), start_line: code_start, 
end_line: final_line, + heading_path: current_heading_path(&heading_stack), }); } - flush_paragraph(&mut blocks, &mut paragraph, paragraph_start, final_line); + flush_paragraph( + &mut blocks, + &mut paragraph, + paragraph_start, + final_line, + &heading_stack, + ); MarkdownDocument { headings, blocks, wikilinks: extract_wikilinks(source_path, content), + tags: extract_tags(content), } } @@ -149,6 +177,7 @@ fn flush_paragraph( paragraph: &mut Vec, start_line: usize, end_line: usize, + heading_stack: &[(usize, String)], ) { if paragraph.is_empty() { return; @@ -159,19 +188,27 @@ fn flush_paragraph( text: paragraph.join(" "), start_line, end_line, + heading_path: current_heading_path(heading_stack), }); paragraph.clear(); } -fn parse_heading(trimmed: &str) -> Option { +fn parse_heading(trimmed: &str) -> Option<(usize, String)> { let hashes = trimmed.chars().take_while(|c| *c == '#').count(); if (1..=6).contains(&hashes) && trimmed.chars().nth(hashes) == Some(' ') { - Some(trimmed[hashes + 1..].trim().to_string()) + Some((hashes, trimmed[hashes + 1..].trim().to_string())) } else { None } } +fn current_heading_path(heading_stack: &[(usize, String)]) -> Vec { + heading_stack + .iter() + .map(|(_, heading)| heading.clone()) + .collect() +} + fn is_list_item(trimmed: &str) -> bool { trimmed.starts_with("- ") || trimmed.starts_with("* ") @@ -203,9 +240,78 @@ pub fn extract_wikilinks(source_path: &str, content: &str) -> Vec { .collect() } +pub fn extract_tags(content: &str) -> Vec { + let mut tags = BTreeSet::new(); + // Frontmatter and inline tags meet here, then we normalize once. 
+ for tag in extract_frontmatter_tags(content) + .into_iter() + .chain(extract_inline_tags(content)) + { + let normalized = normalize_tag(&tag); + if !normalized.is_empty() { + tags.insert(normalized); + } + } + tags.into_iter().collect() +} + +fn extract_frontmatter_tags(content: &str) -> Vec<String> { + let mut tags = Vec::new(); + let mut lines = content.lines(); + if lines.next() != Some("---") { + return tags; + } + + let mut in_tags_list = false; + for line in lines { + let trimmed = line.trim(); + if trimmed == "---" { + break; + } + + if let Some(value) = trimmed.strip_prefix("tags:") { + in_tags_list = true; + tags.extend(split_tag_values(value)); + continue; + } + + if in_tags_list && trimmed.starts_with('-') { + tags.push(trimmed.trim_start_matches('-').trim().to_string()); + continue; + } + + if !trimmed.is_empty() && !trimmed.starts_with('#') { + in_tags_list = false; + } + } + + tags +} + +fn extract_inline_tags(content: &str) -> Vec<String> { + let tag_re = Regex::new(r"(?m)(^|[\s(\[{])#([A-Za-z0-9_/-]+)").expect("valid tag regex"); + tag_re + .captures_iter(content) + .filter_map(|capture| capture.get(2).map(|tag| tag.as_str().to_string())) + .collect() +} + +fn split_tag_values(value: &str) -> Vec<String> { + let value = value.trim().trim_start_matches('[').trim_end_matches(']'); + value + .split(',') + .map(|tag| tag.trim().trim_matches('"').trim_matches('\'').to_string()) + .filter(|tag| !tag.is_empty()) + .collect() +} + +fn normalize_tag(tag: &str) -> String { + tag.trim().trim_start_matches('#').trim().to_lowercase() +} + #[cfg(test)] mod tests { - use super::{MarkdownBlockKind, extract_wikilinks, parse_markdown}; + use super::{MarkdownBlockKind, extract_tags, extract_wikilinks, parse_markdown}; #[test] fn extracts_obsidian_wikilink_forms() { @@ -251,4 +357,13 @@ mod tests { .any(|block| matches!(block.kind, MarkdownBlockKind::CodeBlock)) ); } + + #[test] + fn extracts_and_normalizes_tags() { + let tags = extract_tags( + "---\ntags: [Rust, glassmind]\n---\nBody #Rust
#local-first\n# Heading is not a tag\n", + ); + + assert_eq!(tags, vec!["glassmind", "local-first", "rust"]); + } } diff --git a/src/vault.rs b/src/vault.rs index e1f5cf8..63feeb2 100644 --- a/src/vault.rs +++ b/src/vault.rs @@ -9,6 +9,7 @@ use tracing::{debug, warn}; use walkdir::{DirEntry, WalkDir}; use crate::config::Config; +use crate::db::sha256_hex; use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown}; #[derive(Clone, Debug, Serialize)] @@ -26,9 +27,11 @@ pub struct NoteMetadata { pub title: String, pub modified_unix_secs: Option, pub file_size: u64, + pub content_hash: String, pub headings: Vec, pub blocks: Vec, pub wikilinks: Vec, + pub tags: Vec, } #[derive(Clone, Debug, Serialize)] @@ -39,7 +42,19 @@ pub struct IndexSummary { pub headings: usize, pub blocks: usize, pub wikilinks: usize, + pub tags: usize, pub skipped_dirs: Vec, + pub writes: Option, +} + +#[derive(Clone, Debug, Default, Serialize)] +pub struct IndexWriteSummary { + pub notes_seen: usize, + pub changed_notes: usize, + pub unchanged_notes: usize, + pub chunks_written: usize, + pub tags_seen: usize, + pub links_written: usize, } #[derive(Clone, Debug, Serialize)] @@ -115,7 +130,16 @@ impl VaultIndex { headings: self.notes.iter().map(|note| note.headings.len()).sum(), blocks: self.notes.iter().map(|note| note.blocks.len()).sum(), wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(), + tags: self.notes.iter().map(|note| note.tags.len()).sum(), skipped_dirs: self.skipped_dirs.clone(), + writes: None, + } + } + + pub fn summary_with_writes(&self, writes: IndexWriteSummary) -> IndexSummary { + IndexSummary { + writes: Some(writes), + ..self.summary() } } @@ -213,7 +237,14 @@ impl fmt::Display for IndexSummary { writeln!(f, "Headings parsed: {}", self.headings)?; writeln!(f, "Markdown blocks: {}", self.blocks)?; writeln!(f, "Wikilinks: {}", self.wikilinks)?; - writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len()) + writeln!(f, "Tags: {}", self.tags)?; + writeln!(f, 
"Skipped dirs: {}", self.skipped_dirs.len())?; + if let Some(writes) = &self.writes { + writeln!(f, "Changed notes: {}", writes.changed_notes)?; + writeln!(f, "Unchanged notes skipped: {}", writes.unchanged_notes)?; + writeln!(f, "Chunks written: {}", writes.chunks_written)?; + } + Ok(()) } } @@ -225,6 +256,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result { let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf(); let source_path = relative_path.to_string_lossy().replace('\\', "/"); let parsed = parse_markdown(&source_path, &content); + let content_hash = sha256_hex(&content); Ok(NoteMetadata { path: relative_path, @@ -240,9 +272,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result { .and_then(|modified| modified.duration_since(UNIX_EPOCH).ok()) .map(|duration| duration.as_secs()), file_size: metadata.len(), + content_hash, headings: parsed.headings, blocks: parsed.blocks, wikilinks: parsed.wikilinks, + tags: parsed.tags, }) }