next chunk of the boring but important indexing layer

This commit is contained in:
K. Hodges 2026-05-24 02:47:19 -07:00
parent fc9b2efd0b
commit 9fb82b5324
9 changed files with 639 additions and 14 deletions

176
Cargo.lock generated
View File

@ -73,6 +73,25 @@ version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "cc"
version = "1.2.62"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
dependencies = [
"find-msvc-tools",
"shlex",
]
[[package]] [[package]]
name = "cfg-if" name = "cfg-if"
version = "1.0.4" version = "1.0.4"
@ -125,12 +144,75 @@ version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]]
name = "crypto-common"
version = "0.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
]
[[package]] [[package]]
name = "equivalent" name = "equivalent"
version = "1.0.2" version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "fallible-iterator"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
[[package]]
name = "fallible-streaming-iterator"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "find-msvc-tools"
version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
[[package]]
name = "foldhash"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
[[package]] [[package]]
name = "getopts" name = "getopts"
version = "0.2.24" version = "0.2.24"
@ -148,20 +230,40 @@ dependencies = [
"clap", "clap",
"pulldown-cmark", "pulldown-cmark",
"regex", "regex",
"rusqlite",
"serde", "serde",
"serde_json", "serde_json",
"sha2",
"toml", "toml",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"walkdir", "walkdir",
] ]
[[package]]
name = "hashbrown"
version = "0.15.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
dependencies = [
"foldhash",
]
[[package]] [[package]]
name = "hashbrown" name = "hashbrown"
version = "0.17.1" version = "0.17.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
[[package]]
name = "hashlink"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1"
dependencies = [
"hashbrown 0.15.5",
]
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.5.0" version = "0.5.0"
@ -175,7 +277,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
dependencies = [ dependencies = [
"equivalent", "equivalent",
"hashbrown", "hashbrown 0.17.1",
] ]
[[package]] [[package]]
@ -196,6 +298,23 @@ version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
[[package]]
name = "libc"
version = "0.2.186"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
[[package]]
name = "libsqlite3-sys"
version = "0.35.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "133c182a6a2c87864fe97778797e46c7e999672690dc9fa3ee8e241aa4a9c13f"
dependencies = [
"cc",
"pkg-config",
"vcpkg",
]
[[package]] [[package]]
name = "log" name = "log"
version = "0.4.29" version = "0.4.29"
@ -244,6 +363,12 @@ version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
[[package]]
name = "pkg-config"
version = "0.3.33"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "1.0.106" version = "1.0.106"
@ -310,6 +435,20 @@ version = "0.8.10"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
[[package]]
name = "rusqlite"
version = "0.37.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "165ca6e57b20e1351573e3729b958bc62f0e48025386970b6e4d29e7a7e71f3f"
dependencies = [
"bitflags",
"fallible-iterator",
"fallible-streaming-iterator",
"hashlink",
"libsqlite3-sys",
"smallvec",
]
[[package]] [[package]]
name = "same-file" name = "same-file"
version = "1.0.6" version = "1.0.6"
@ -371,6 +510,17 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "sha2"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]] [[package]]
name = "sharded-slab" name = "sharded-slab"
version = "0.1.7" version = "0.1.7"
@ -380,6 +530,12 @@ dependencies = [
"lazy_static", "lazy_static",
] ]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]] [[package]]
name = "smallvec" name = "smallvec"
version = "1.15.1" version = "1.15.1"
@ -512,6 +668,12 @@ dependencies = [
"tracing-log", "tracing-log",
] ]
[[package]]
name = "typenum"
version = "1.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
[[package]] [[package]]
name = "unicase" name = "unicase"
version = "2.9.0" version = "2.9.0"
@ -542,6 +704,18 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
[[package]]
name = "vcpkg"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]] [[package]]
name = "walkdir" name = "walkdir"
version = "2.5.0" version = "2.5.0"

View File

@ -8,8 +8,10 @@ anyhow = "1.0"
clap = { version = "4.5", features = ["derive"] } clap = { version = "4.5", features = ["derive"] }
pulldown-cmark = "0.13" pulldown-cmark = "0.13"
regex = "1.11" regex = "1.11"
rusqlite = { version = "0.37", features = ["bundled"] }
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0" serde_json = "1.0"
sha2 = "0.10"
toml = "0.9" toml = "0.9"
tracing = "0.1" tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] } tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }

View File

@ -232,7 +232,7 @@ Store:
--- ---
## [ ] GM-011 — Extract tags ## [x] GM-011 — Extract tags
### Goals ### Goals
Parse tags from notes. Parse tags from notes.
@ -254,7 +254,7 @@ Normalize:
# Phase 3 — Database Layer # Phase 3 — Database Layer
## [ ] GM-012 — Add SQLite integration ## [x] GM-012 — Add SQLite integration
### Goals ### Goals
Create local metadata database. Create local metadata database.
@ -271,7 +271,7 @@ Create local metadata database.
--- ---
## [ ] GM-013 — Create notes table ## [x] GM-013 — Create notes table
### Goals ### Goals
Store note metadata. Store note metadata.
@ -289,7 +289,7 @@ Create schema for:
--- ---
## [ ] GM-014 — Create chunks table ## [x] GM-014 — Create chunks table
### Goals ### Goals
Store retrieval chunks. Store retrieval chunks.
@ -308,7 +308,7 @@ Store:
--- ---
## [ ] GM-015 — Add content hashing ## [x] GM-015 — Add content hashing
### Goals ### Goals
Detect changed notes efficiently. Detect changed notes efficiently.

View File

@ -1,6 +1,9 @@
[vault] [vault]
path = "." path = "."
[database]
path = ".agent/cache/glassmind.sqlite3"
[index] [index]
include_agent_dir = true include_agent_dir = true
ignore_dirs = [ ignore_dirs = [

View File

@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Config { pub struct Config {
pub vault: VaultConfig, pub vault: VaultConfig,
pub database: DatabaseConfig,
pub index: IndexConfig, pub index: IndexConfig,
pub embeddings: EmbeddingsConfig, pub embeddings: EmbeddingsConfig,
pub search: SearchConfig, pub search: SearchConfig,
@ -19,6 +20,11 @@ pub struct VaultConfig {
pub path: PathBuf, pub path: PathBuf,
} }
/// Settings for the on-disk SQLite metadata index (the `[database]` config table).
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct DatabaseConfig {
/// Location of the SQLite file. Config validation rejects an empty path, and the
/// index command joins this onto `vault.path` before opening the database.
pub path: PathBuf,
}
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize, Serialize)]
pub struct IndexConfig { pub struct IndexConfig {
pub include_agent_dir: bool, pub include_agent_dir: bool,
@ -93,6 +99,9 @@ impl Config {
if self.server.port == 0 { if self.server.port == 0 {
bail!("server.port must be greater than zero"); bail!("server.port must be greater than zero");
} }
if self.database.path.as_os_str().is_empty() {
bail!("database.path must not be empty");
}
match self.writes.mode.as_str() { match self.writes.mode.as_str() {
"off" | "agent-only" | "propose" | "allow" => {} "off" | "agent-only" | "propose" | "allow" => {}
other => { other => {
@ -138,6 +147,9 @@ impl Default for Config {
vault: VaultConfig { vault: VaultConfig {
path: PathBuf::from("."), path: PathBuf::from("."),
}, },
database: DatabaseConfig {
path: PathBuf::from(".agent/cache/glassmind.sqlite3"),
},
index: IndexConfig { index: IndexConfig {
include_agent_dir: true, include_agent_dir: true,
ignore_dirs: vec![ ignore_dirs: vec![

277
src/db.rs Normal file
View File

@ -0,0 +1,277 @@
use std::fs;
use std::path::{Path, PathBuf};
use anyhow::{Context, Result};
use rusqlite::{Connection, OptionalExtension, params};
use sha2::{Digest, Sha256};
use tracing::debug;
use crate::markdown::MarkdownBlockKind;
use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
/// SQLite-backed store for the vault's metadata index (notes, chunks, tags, links).
pub struct IndexStore {
// Connection owned by the store; the schema is bootstrapped on it inside `open`.
conn: Connection,
}
impl IndexStore {
/// Opens the SQLite database at `path` and makes sure the schema exists.
///
/// The parent directory is created first when `path` has one.
///
/// # Errors
/// Fails if the directory cannot be created, the database cannot be opened,
/// or the schema bootstrap statements fail.
pub fn open(path: &Path) -> Result<Self> {
if let Some(parent) = path.parent() {
fs::create_dir_all(parent)
.with_context(|| format!("failed to create db dir {}", parent.display()))?;
}
let conn = Connection::open(path)
.with_context(|| format!("failed to open sqlite db {}", path.display()))?;
let store = Self { conn };
store.bootstrap()?;
Ok(store)
}
/// Writes every note in `index` to the database inside a single transaction.
///
/// A note whose stored `content_hash` equals the freshly computed one is skipped.
/// Any other note is upserted and its chunks, tag associations and links are
/// deleted and re-inserted. Returns counters describing what was written.
///
/// NOTE(review): rows for notes that are no longer present in `index` are not
/// deleted here, so removed or renamed files appear to stay in the database —
/// confirm whether pruning is handled elsewhere.
///
/// # Errors
/// Any SQLite failure aborts the call; the transaction is not committed in that case.
pub fn write_index(&mut self, index: &VaultIndex) -> Result<IndexWriteSummary> {
let tx = self.conn.transaction()?;
let mut summary = IndexWriteSummary::default();
// This is a rebuildable cache, so changed notes get their child rows replaced in place.
for note in &index.notes {
summary.notes_seen += 1;
// `None` means the path has never been indexed, which also counts as changed.
let existing_hash = existing_note_hash(&tx, &note.path)?;
if existing_hash.as_deref() == Some(note.content_hash.as_str()) {
summary.unchanged_notes += 1;
debug!(path = %note.path.display(), "skipping unchanged note");
continue;
}
summary.changed_notes += 1;
let note_id = upsert_note(&tx, note)?;
// Child rows are rebuilt from scratch rather than diffed.
clear_note_children(&tx, note_id)?;
insert_chunks(&tx, note_id, note, &mut summary)?;
insert_tags(&tx, note_id, note, &mut summary)?;
insert_links(&tx, note_id, note, &mut summary)?;
}
tx.commit()?;
Ok(summary)
}
/// Enables foreign-key enforcement on this connection, creates the `migrations`,
/// `notes`, `chunks`, `tags`, `note_tags` and `links` tables when they do not
/// exist yet, and records migration 1. Every statement is `IF NOT EXISTS` /
/// `OR IGNORE`, so running it against an already-initialized database is a no-op.
fn bootstrap(&self) -> Result<()> {
self.conn.execute_batch(
r#"
PRAGMA foreign_keys = ON;
CREATE TABLE IF NOT EXISTS migrations (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE,
applied_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS notes (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL UNIQUE,
filename TEXT NOT NULL,
title TEXT NOT NULL,
modified_unix_secs INTEGER,
file_size INTEGER NOT NULL,
content_hash TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS chunks (
id INTEGER PRIMARY KEY,
note_id INTEGER NOT NULL,
chunk_index INTEGER NOT NULL,
heading_path TEXT NOT NULL,
content TEXT NOT NULL,
chunk_type TEXT NOT NULL,
start_line INTEGER NOT NULL,
end_line INTEGER NOT NULL,
token_estimate INTEGER NOT NULL,
content_hash TEXT NOT NULL,
FOREIGN KEY(note_id) REFERENCES notes(id) ON DELETE CASCADE,
UNIQUE(note_id, chunk_index)
);
CREATE TABLE IF NOT EXISTS tags (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS note_tags (
note_id INTEGER NOT NULL,
tag_id INTEGER NOT NULL,
FOREIGN KEY(note_id) REFERENCES notes(id) ON DELETE CASCADE,
FOREIGN KEY(tag_id) REFERENCES tags(id) ON DELETE CASCADE,
PRIMARY KEY(note_id, tag_id)
);
CREATE TABLE IF NOT EXISTS links (
id INTEGER PRIMARY KEY,
source_note_id INTEGER NOT NULL,
target TEXT NOT NULL,
alias TEXT,
link_type TEXT NOT NULL DEFAULT 'wikilink',
FOREIGN KEY(source_note_id) REFERENCES notes(id) ON DELETE CASCADE
);
INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
"#,
)?;
Ok(())
}
}
/// Fetches the content hash currently stored for the note at `path`.
///
/// Yields `Ok(None)` when the `notes` table has no row for that path.
fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> {
    let key = path_to_db(path);
    let stored = conn
        .query_row(
            "SELECT content_hash FROM notes WHERE path = ?1",
            [key],
            |row| row.get::<_, String>(0),
        )
        .optional();
    stored.context("failed to read existing note hash")
}
/// Inserts the note row, or refreshes it when the path is already known, and
/// returns the row id of that note.
fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
    // The normalized path is both the conflict key and the lookup key below.
    let db_path = path_to_db(&note.path);

    let upsert_sql = r#"
INSERT INTO notes (
path,
filename,
title,
modified_unix_secs,
file_size,
content_hash,
updated_at
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP)
ON CONFLICT(path) DO UPDATE SET
filename = excluded.filename,
title = excluded.title,
modified_unix_secs = excluded.modified_unix_secs,
file_size = excluded.file_size,
content_hash = excluded.content_hash,
updated_at = CURRENT_TIMESTAMP
"#;
    conn.execute(
        upsert_sql,
        params![
            db_path,
            note.filename,
            note.title,
            note.modified_unix_secs,
            note.file_size,
            note.content_hash,
        ],
    )?;

    // Read the id back by path: it is the same row whether we inserted or updated.
    let note_id = conn.query_row(
        "SELECT id FROM notes WHERE path = ?1",
        [db_path],
        |row| row.get::<_, i64>(0),
    );
    note_id.context("failed to read upserted note id")
}
/// Removes every chunk, tag association and outgoing link that belongs to `note_id`,
/// so the caller can re-insert them from a fresh parse.
fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
    // Executed in this order; the first failure stops the sequence.
    const CHILD_DELETES: [&str; 3] = [
        "DELETE FROM chunks WHERE note_id = ?1",
        "DELETE FROM note_tags WHERE note_id = ?1",
        "DELETE FROM links WHERE source_note_id = ?1",
    ];
    for sql in CHILD_DELETES {
        conn.execute(sql, [note_id])?;
    }
    Ok(())
}
/// Stores one `chunks` row per non-blank markdown block of `note`.
///
/// `chunk_index` is the block's position in `note.blocks`, so skipped blank blocks
/// leave gaps in the numbering. `summary.chunks_written` is bumped for each row
/// actually inserted.
///
/// # Errors
/// Returns the first SQLite error encountered while preparing or executing the insert.
fn insert_chunks(
    conn: &Connection,
    note_id: i64,
    note: &NoteMetadata,
    summary: &mut IndexWriteSummary,
) -> Result<()> {
    // Prepare once (and keep it in the connection's statement cache) instead of
    // re-parsing the same SQL for every block of every note.
    let mut insert_chunk = conn.prepare_cached(
        r#"
INSERT INTO chunks (
note_id,
chunk_index,
heading_path,
content,
chunk_type,
start_line,
end_line,
token_estimate,
content_hash
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
"#,
    )?;

    for (idx, block) in note.blocks.iter().enumerate() {
        // Whitespace-only blocks carry nothing worth retrieving.
        if block.text.trim().is_empty() {
            continue;
        }
        insert_chunk.execute(params![
            note_id,
            // usize -> i64: SQLite integers are signed 64-bit; positions, line
            // numbers and word counts are far below that limit in practice.
            idx as i64,
            block.heading_path.join(" > "),
            block.text,
            chunk_type(&block.kind),
            block.start_line as i64,
            block.end_line as i64,
            estimate_tokens(&block.text) as i64,
            sha256_hex(&block.text),
        ])?;
        summary.chunks_written += 1;
    }
    Ok(())
}
/// Makes sure every tag of `note` exists in `tags` and is linked to `note_id`
/// through `note_tags`.
///
/// `summary.tags_seen` is incremented once per tag processed, whether or not the
/// tag row or the association already existed.
///
/// # Errors
/// Returns the first SQLite error encountered.
fn insert_tags(
    conn: &Connection,
    note_id: i64,
    note: &NoteMetadata,
    summary: &mut IndexWriteSummary,
) -> Result<()> {
    // All three statements are loop-invariant: prepare them once via the
    // connection's statement cache rather than re-parsing the SQL for every tag.
    let mut ensure_tag = conn.prepare_cached("INSERT OR IGNORE INTO tags (name) VALUES (?1)")?;
    let mut find_tag = conn.prepare_cached("SELECT id FROM tags WHERE name = ?1")?;
    let mut attach_tag = conn
        .prepare_cached("INSERT OR IGNORE INTO note_tags (note_id, tag_id) VALUES (?1, ?2)")?;

    for tag in &note.tags {
        ensure_tag.execute([tag])?;
        // The insert may have been ignored, so the id is always looked up by name.
        let tag_id: i64 = find_tag.query_row([tag], |row| row.get(0))?;
        attach_tag.execute(params![note_id, tag_id])?;
        summary.tags_seen += 1;
    }
    Ok(())
}
/// Stores one `links` row for every wikilink found in `note`, with `note_id` as
/// the source, and counts each inserted row in `summary.links_written`.
///
/// # Errors
/// Returns the first SQLite error encountered.
fn insert_links(
    conn: &Connection,
    note_id: i64,
    note: &NoteMetadata,
    summary: &mut IndexWriteSummary,
) -> Result<()> {
    // Prepared once and reused for every link instead of re-parsing the SQL per row.
    let mut insert_link = conn.prepare_cached(
        "INSERT INTO links (source_note_id, target, alias, link_type) VALUES (?1, ?2, ?3, 'wikilink')",
    )?;

    for link in &note.wikilinks {
        insert_link.execute(params![note_id, link.target, link.alias])?;
        summary.links_written += 1;
    }
    Ok(())
}
/// Returns the SHA-256 digest of `content`'s UTF-8 bytes as lowercase hexadecimal.
pub fn sha256_hex(content: &str) -> String {
    let mut hasher = Sha256::new();
    hasher.update(content.as_bytes());
    let digest = hasher.finalize();
    format!("{digest:x}")
}
/// Rough token estimate: the number of whitespace-separated words, never less than one.
fn estimate_tokens(content: &str) -> usize {
    let words = content.split_whitespace().count();
    if words == 0 { 1 } else { words }
}
fn chunk_type(kind: &MarkdownBlockKind) -> &'static str {
match kind {
MarkdownBlockKind::Heading => "heading",
MarkdownBlockKind::Paragraph => "paragraph",
MarkdownBlockKind::CodeBlock => "code_block",
MarkdownBlockKind::List => "list",
}
}
/// Renders `path` as the string key used in the database: lossy UTF-8 with every
/// backslash replaced by a forward slash.
fn path_to_db(path: &Path) -> String {
    let owned: PathBuf = path.to_path_buf();
    let text = owned.to_string_lossy();
    text.replace('\\', "/")
}

View File

@ -1,5 +1,6 @@
mod cli; mod cli;
mod config; mod config;
mod db;
mod logging; mod logging;
mod markdown; mod markdown;
mod vault; mod vault;
@ -10,6 +11,7 @@ use tracing::{debug, info};
use crate::cli::{Cli, Commands, OutputFormat}; use crate::cli::{Cli, Commands, OutputFormat};
use crate::config::Config; use crate::config::Config;
use crate::db::IndexStore;
use crate::vault::VaultIndex; use crate::vault::VaultIndex;
fn main() -> Result<()> { fn main() -> Result<()> {
@ -25,10 +27,16 @@ fn main() -> Result<()> {
Commands::Init { force } => init_project(&config, force), Commands::Init { force } => init_project(&config, force),
Commands::Index { json } => { Commands::Index { json } => {
let index = VaultIndex::scan(&config)?; let index = VaultIndex::scan(&config)?;
config.create_agent_dirs()?;
// Indexing writes the rebuildable cache, while search can still scan live markdown.
let db_path = config.vault.path.join(&config.database.path);
let mut store = IndexStore::open(&db_path)?;
let writes = store.write_index(&index)?;
let summary = index.summary_with_writes(writes);
if json { if json {
println!("{}", serde_json::to_string_pretty(&index.summary())?); println!("{}", serde_json::to_string_pretty(&summary)?);
} else { } else {
println!("{}", index.summary()); println!("{summary}");
} }
Ok(()) Ok(())
} }

View File

@ -1,3 +1,5 @@
use std::collections::BTreeSet;
use regex::Regex; use regex::Regex;
use serde::Serialize; use serde::Serialize;
@ -6,6 +8,7 @@ pub struct MarkdownDocument {
pub headings: Vec<String>, pub headings: Vec<String>,
pub blocks: Vec<MarkdownBlock>, pub blocks: Vec<MarkdownBlock>,
pub wikilinks: Vec<Wikilink>, pub wikilinks: Vec<Wikilink>,
pub tags: Vec<String>,
} }
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -14,6 +17,7 @@ pub struct MarkdownBlock {
pub text: String, pub text: String,
pub start_line: usize, pub start_line: usize,
pub end_line: usize, pub end_line: usize,
pub heading_path: Vec<String>,
} }
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -42,11 +46,13 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
let mut in_code = false; let mut in_code = false;
let mut code = Vec::new(); let mut code = Vec::new();
let mut code_start = 0; let mut code_start = 0;
let mut heading_stack: Vec<(usize, String)> = Vec::new();
for (idx, line) in content.lines().enumerate() { for (idx, line) in content.lines().enumerate() {
let line_no = idx + 1; let line_no = idx + 1;
let trimmed = line.trim(); let trimmed = line.trim();
// Code fences get kept whole so later chunks stay readable.
if trimmed.starts_with("```") || trimmed.starts_with("~~~") { if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
if in_code { if in_code {
code.push(line.to_string()); code.push(line.to_string());
@ -55,6 +61,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
text: code.join("\n"), text: code.join("\n"),
start_line: code_start, start_line: code_start,
end_line: line_no, end_line: line_no,
heading_path: current_heading_path(&heading_stack),
}); });
code.clear(); code.clear();
in_code = false; in_code = false;
@ -64,6 +71,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
&mut paragraph, &mut paragraph,
paragraph_start, paragraph_start,
line_no.saturating_sub(1), line_no.saturating_sub(1),
&heading_stack,
); );
in_code = true; in_code = true;
code_start = line_no; code_start = line_no;
@ -77,19 +85,28 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
continue; continue;
} }
if let Some(heading) = parse_heading(trimmed) { if let Some((level, heading)) = parse_heading(trimmed) {
flush_paragraph( flush_paragraph(
&mut blocks, &mut blocks,
&mut paragraph, &mut paragraph,
paragraph_start, paragraph_start,
line_no.saturating_sub(1), line_no.saturating_sub(1),
&heading_stack,
); );
while heading_stack
.last()
.is_some_and(|(last_level, _)| *last_level >= level)
{
heading_stack.pop();
}
heading_stack.push((level, heading.clone()));
headings.push(heading.clone()); headings.push(heading.clone());
blocks.push(MarkdownBlock { blocks.push(MarkdownBlock {
kind: MarkdownBlockKind::Heading, kind: MarkdownBlockKind::Heading,
text: heading, text: heading,
start_line: line_no, start_line: line_no,
end_line: line_no, end_line: line_no,
heading_path: current_heading_path(&heading_stack),
}); });
continue; continue;
} }
@ -100,12 +117,14 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
&mut paragraph, &mut paragraph,
paragraph_start, paragraph_start,
line_no.saturating_sub(1), line_no.saturating_sub(1),
&heading_stack,
); );
blocks.push(MarkdownBlock { blocks.push(MarkdownBlock {
kind: MarkdownBlockKind::List, kind: MarkdownBlockKind::List,
text: trimmed.to_string(), text: trimmed.to_string(),
start_line: line_no, start_line: line_no,
end_line: line_no, end_line: line_no,
heading_path: current_heading_path(&heading_stack),
}); });
continue; continue;
} }
@ -116,6 +135,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
&mut paragraph, &mut paragraph,
paragraph_start, paragraph_start,
line_no.saturating_sub(1), line_no.saturating_sub(1),
&heading_stack,
); );
continue; continue;
} }
@ -133,14 +153,22 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
text: code.join("\n"), text: code.join("\n"),
start_line: code_start, start_line: code_start,
end_line: final_line, end_line: final_line,
heading_path: current_heading_path(&heading_stack),
}); });
} }
flush_paragraph(&mut blocks, &mut paragraph, paragraph_start, final_line); flush_paragraph(
&mut blocks,
&mut paragraph,
paragraph_start,
final_line,
&heading_stack,
);
MarkdownDocument { MarkdownDocument {
headings, headings,
blocks, blocks,
wikilinks: extract_wikilinks(source_path, content), wikilinks: extract_wikilinks(source_path, content),
tags: extract_tags(content),
} }
} }
@ -149,6 +177,7 @@ fn flush_paragraph(
paragraph: &mut Vec<String>, paragraph: &mut Vec<String>,
start_line: usize, start_line: usize,
end_line: usize, end_line: usize,
heading_stack: &[(usize, String)],
) { ) {
if paragraph.is_empty() { if paragraph.is_empty() {
return; return;
@ -159,19 +188,27 @@ fn flush_paragraph(
text: paragraph.join(" "), text: paragraph.join(" "),
start_line, start_line,
end_line, end_line,
heading_path: current_heading_path(heading_stack),
}); });
paragraph.clear(); paragraph.clear();
} }
fn parse_heading(trimmed: &str) -> Option<String> { fn parse_heading(trimmed: &str) -> Option<(usize, String)> {
let hashes = trimmed.chars().take_while(|c| *c == '#').count(); let hashes = trimmed.chars().take_while(|c| *c == '#').count();
if (1..=6).contains(&hashes) && trimmed.chars().nth(hashes) == Some(' ') { if (1..=6).contains(&hashes) && trimmed.chars().nth(hashes) == Some(' ') {
Some(trimmed[hashes + 1..].trim().to_string()) Some((hashes, trimmed[hashes + 1..].trim().to_string()))
} else { } else {
None None
} }
} }
/// Copies the heading titles out of the `(level, title)` stack, outermost first.
fn current_heading_path(heading_stack: &[(usize, String)]) -> Vec<String> {
    let mut titles = Vec::with_capacity(heading_stack.len());
    for (_, title) in heading_stack {
        titles.push(title.clone());
    }
    titles
}
fn is_list_item(trimmed: &str) -> bool { fn is_list_item(trimmed: &str) -> bool {
trimmed.starts_with("- ") trimmed.starts_with("- ")
|| trimmed.starts_with("* ") || trimmed.starts_with("* ")
@ -203,9 +240,78 @@ pub fn extract_wikilinks(source_path: &str, content: &str) -> Vec<Wikilink> {
.collect() .collect()
} }
/// Collects the tags of a note from both its frontmatter and its inline `#tag`
/// mentions, normalizes them, and returns them deduplicated in sorted order.
pub fn extract_tags(content: &str) -> Vec<String> {
    let frontmatter = extract_frontmatter_tags(content);
    let inline = extract_inline_tags(content);

    // Both sources are normalized the same way; the set dedupes and sorts.
    let unique: BTreeSet<String> = frontmatter
        .iter()
        .chain(inline.iter())
        .map(|raw| normalize_tag(raw))
        .filter(|tag| !tag.is_empty())
        .collect();

    unique.into_iter().collect()
}
/// Reads raw (un-normalized) tag values from a leading `---` frontmatter block.
///
/// Two spellings are recognized:
/// * inline: `tags: [a, "b"]` or `tags: a, b`
/// * block list: `tags:` followed by `- a` lines
///
/// Surrounding single or double quotes are stripped in both forms. Returns an empty
/// vector when the content does not start with a `---` line. Scanning stops at the
/// closing `---`, or at end of input if the block is never closed.
fn extract_frontmatter_tags(content: &str) -> Vec<String> {
    let mut tags = Vec::new();
    let mut lines = content.lines();
    // Trailing whitespace after the opening fence is tolerated, as it is for the
    // closing fence below.
    if lines.next().map(str::trim_end) != Some("---") {
        return tags;
    }

    let mut in_tags_list = false;
    for line in lines {
        let trimmed = line.trim();
        if trimmed == "---" {
            break;
        }
        if let Some(value) = trimmed.strip_prefix("tags:") {
            in_tags_list = true;
            tags.extend(split_tag_values(value));
            continue;
        }
        if in_tags_list && trimmed.starts_with('-') {
            // Strip quotes here too, so `- "rust"` yields the same tag as `["rust"]`.
            let item = trimmed
                .trim_start_matches('-')
                .trim()
                .trim_matches('"')
                .trim_matches('\'');
            tags.push(item.to_string());
            continue;
        }
        // Any other non-blank, non-comment line ends the block list.
        if !trimmed.is_empty() && !trimmed.starts_with('#') {
            in_tags_list = false;
        }
    }
    tags
}
fn extract_inline_tags(content: &str) -> Vec<String> {
let tag_re = Regex::new(r"(?m)(^|[\s(\[{])#([A-Za-z0-9_/-]+)").expect("valid tag regex");
tag_re
.captures_iter(content)
.filter_map(|capture| capture.get(2).map(|tag| tag.as_str().to_string()))
.collect()
}
/// Splits the value part of a `tags:` line into individual raw tags.
///
/// Optional surrounding `[` / `]` are dropped, entries are separated on commas, and
/// each entry is trimmed and unquoted; entries that end up empty are skipped.
fn split_tag_values(value: &str) -> Vec<String> {
    let inner = value.trim().trim_start_matches('[').trim_end_matches(']');

    let mut tags = Vec::new();
    for piece in inner.split(',') {
        let cleaned = piece.trim().trim_matches('"').trim_matches('\'');
        if !cleaned.is_empty() {
            tags.push(cleaned.to_string());
        }
    }
    tags
}
/// Canonical form of a tag: surrounding whitespace and leading `#` characters
/// removed, then lowercased.
fn normalize_tag(tag: &str) -> String {
    let without_hash = tag.trim().trim_start_matches('#');
    without_hash.trim().to_lowercase()
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{MarkdownBlockKind, extract_wikilinks, parse_markdown}; use super::{MarkdownBlockKind, extract_tags, extract_wikilinks, parse_markdown};
#[test] #[test]
fn extracts_obsidian_wikilink_forms() { fn extracts_obsidian_wikilink_forms() {
@ -251,4 +357,13 @@ mod tests {
.any(|block| matches!(block.kind, MarkdownBlockKind::CodeBlock)) .any(|block| matches!(block.kind, MarkdownBlockKind::CodeBlock))
); );
} }
// Frontmatter and inline tags are merged, lowercased, deduplicated ("Rust" appears
// in both places) and returned sorted; the `# Heading` line must not yield a tag.
#[test]
fn extracts_and_normalizes_tags() {
let tags = extract_tags(
"---\ntags: [Rust, glassmind]\n---\nBody #Rust #local-first\n# Heading is not a tag\n",
);
assert_eq!(tags, vec!["glassmind", "local-first", "rust"]);
}
} }

View File

@ -9,6 +9,7 @@ use tracing::{debug, warn};
use walkdir::{DirEntry, WalkDir}; use walkdir::{DirEntry, WalkDir};
use crate::config::Config; use crate::config::Config;
use crate::db::sha256_hex;
use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown}; use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown};
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -26,9 +27,11 @@ pub struct NoteMetadata {
pub title: String, pub title: String,
pub modified_unix_secs: Option<u64>, pub modified_unix_secs: Option<u64>,
pub file_size: u64, pub file_size: u64,
pub content_hash: String,
pub headings: Vec<String>, pub headings: Vec<String>,
pub blocks: Vec<MarkdownBlock>, pub blocks: Vec<MarkdownBlock>,
pub wikilinks: Vec<Wikilink>, pub wikilinks: Vec<Wikilink>,
pub tags: Vec<String>,
} }
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -39,7 +42,19 @@ pub struct IndexSummary {
pub headings: usize, pub headings: usize,
pub blocks: usize, pub blocks: usize,
pub wikilinks: usize, pub wikilinks: usize,
pub tags: usize,
pub skipped_dirs: Vec<PathBuf>, pub skipped_dirs: Vec<PathBuf>,
pub writes: Option<IndexWriteSummary>,
}
/// Counters reported by a database write pass over a scanned vault index.
#[derive(Clone, Debug, Default, Serialize)]
pub struct IndexWriteSummary {
/// Notes visited by the write pass, changed or not.
pub notes_seen: usize,
/// Notes whose stored content hash was missing or different, and were rewritten.
pub changed_notes: usize,
/// Notes skipped because their stored content hash matched.
pub unchanged_notes: usize,
/// Chunk rows inserted for changed notes (blank blocks are not written).
pub chunks_written: usize,
/// Tags processed for changed notes, counted once per note/tag pair.
pub tags_seen: usize,
/// Link rows inserted for changed notes.
pub links_written: usize,
}
#[derive(Clone, Debug, Serialize)] #[derive(Clone, Debug, Serialize)]
@ -115,7 +130,16 @@ impl VaultIndex {
headings: self.notes.iter().map(|note| note.headings.len()).sum(), headings: self.notes.iter().map(|note| note.headings.len()).sum(),
blocks: self.notes.iter().map(|note| note.blocks.len()).sum(), blocks: self.notes.iter().map(|note| note.blocks.len()).sum(),
wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(), wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(),
tags: self.notes.iter().map(|note| note.tags.len()).sum(),
skipped_dirs: self.skipped_dirs.clone(), skipped_dirs: self.skipped_dirs.clone(),
writes: None,
}
}
pub fn summary_with_writes(&self, writes: IndexWriteSummary) -> IndexSummary {
IndexSummary {
writes: Some(writes),
..self.summary()
} }
} }
@ -213,7 +237,14 @@ impl fmt::Display for IndexSummary {
writeln!(f, "Headings parsed: {}", self.headings)?; writeln!(f, "Headings parsed: {}", self.headings)?;
writeln!(f, "Markdown blocks: {}", self.blocks)?; writeln!(f, "Markdown blocks: {}", self.blocks)?;
writeln!(f, "Wikilinks: {}", self.wikilinks)?; writeln!(f, "Wikilinks: {}", self.wikilinks)?;
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len()) writeln!(f, "Tags: {}", self.tags)?;
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?;
if let Some(writes) = &self.writes {
writeln!(f, "Changed notes: {}", writes.changed_notes)?;
writeln!(f, "Unchanged notes skipped: {}", writes.unchanged_notes)?;
writeln!(f, "Chunks written: {}", writes.chunks_written)?;
}
Ok(())
} }
} }
@ -225,6 +256,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf(); let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf();
let source_path = relative_path.to_string_lossy().replace('\\', "/"); let source_path = relative_path.to_string_lossy().replace('\\', "/");
let parsed = parse_markdown(&source_path, &content); let parsed = parse_markdown(&source_path, &content);
let content_hash = sha256_hex(&content);
Ok(NoteMetadata { Ok(NoteMetadata {
path: relative_path, path: relative_path,
@ -240,9 +272,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
.and_then(|modified| modified.duration_since(UNIX_EPOCH).ok()) .and_then(|modified| modified.duration_since(UNIX_EPOCH).ok())
.map(|duration| duration.as_secs()), .map(|duration| duration.as_secs()),
file_size: metadata.len(), file_size: metadata.len(),
content_hash,
headings: parsed.headings, headings: parsed.headings,
blocks: parsed.blocks, blocks: parsed.blocks,
wikilinks: parsed.wikilinks, wikilinks: parsed.wikilinks,
tags: parsed.tags,
}) })
} }