mirror of
https://github.com/khodges42/glassMind.git
synced 2026-06-14 18:18:36 +00:00
next chunk of the boring but important indexing layer
This commit is contained in:
parent
fc9b2efd0b
commit
9fb82b5324
176
Cargo.lock
generated
176
Cargo.lock
generated
|
|
@ -73,6 +73,25 @@ version = "2.11.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.62"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1dce859f0832a7d088c4f1119888ab94ef4b5d6795d1ce05afb7fe159d79f98"
|
||||
dependencies = [
|
||||
"find-msvc-tools",
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.4"
|
||||
|
|
@ -125,12 +144,75 @@ version = "1.0.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crypto-common"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
||||
dependencies = [
|
||||
"block-buffer",
|
||||
"crypto-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "fallible-iterator"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2acce4a10f12dc2fb14a218589d4f1f62ef011b2d0cc4b3cb1bba8e94da14649"
|
||||
|
||||
[[package]]
|
||||
name = "fallible-streaming-iterator"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||
|
||||
[[package]]
|
||||
name = "find-msvc-tools"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
|
||||
dependencies = [
|
||||
"typenum",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.24"
|
||||
|
|
@ -148,20 +230,40 @@ dependencies = [
|
|||
"clap",
|
||||
"pulldown-cmark",
|
||||
"regex",
|
||||
"rusqlite",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"sha2",
|
||||
"toml",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1"
|
||||
dependencies = [
|
||||
"foldhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.17.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a"
|
||||
|
||||
[[package]]
|
||||
name = "hashlink"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1"
|
||||
dependencies = [
|
||||
"hashbrown 0.15.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "heck"
|
||||
version = "0.5.0"
|
||||
|
|
@ -175,7 +277,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
"hashbrown 0.17.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -196,6 +298,23 @@ version = "1.5.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.186"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66"
|
||||
|
||||
[[package]]
|
||||
name = "libsqlite3-sys"
|
||||
version = "0.35.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "133c182a6a2c87864fe97778797e46c7e999672690dc9fa3ee8e241aa4a9c13f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.29"
|
||||
|
|
@ -244,6 +363,12 @@ version = "0.2.17"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd"
|
||||
|
||||
[[package]]
|
||||
name = "pkg-config"
|
||||
version = "0.3.33"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19f132c84eca552bf34cab8ec81f1c1dcc229b811638f9d283dceabe58c5569e"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.106"
|
||||
|
|
@ -310,6 +435,20 @@ version = "0.8.10"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a"
|
||||
|
||||
[[package]]
|
||||
name = "rusqlite"
|
||||
version = "0.37.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "165ca6e57b20e1351573e3729b958bc62f0e48025386970b6e4d29e7a7e71f3f"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"fallible-iterator",
|
||||
"fallible-streaming-iterator",
|
||||
"hashlink",
|
||||
"libsqlite3-sys",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "same-file"
|
||||
version = "1.0.6"
|
||||
|
|
@ -371,6 +510,17 @@ dependencies = [
|
|||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sharded-slab"
|
||||
version = "0.1.7"
|
||||
|
|
@ -380,6 +530,12 @@ dependencies = [
|
|||
"lazy_static",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.15.1"
|
||||
|
|
@ -512,6 +668,12 @@ dependencies = [
|
|||
"tracing-log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "40ce102ab67701b8526c123c1bab5cbe42d7040ccfd0f64af1a385808d2f43de"
|
||||
|
||||
[[package]]
|
||||
name = "unicase"
|
||||
version = "2.9.0"
|
||||
|
|
@ -542,6 +704,18 @@ version = "0.1.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
|
||||
|
||||
[[package]]
|
||||
name = "vcpkg"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "walkdir"
|
||||
version = "2.5.0"
|
||||
|
|
|
|||
|
|
@ -8,8 +8,10 @@ anyhow = "1.0"
|
|||
clap = { version = "4.5", features = ["derive"] }
|
||||
pulldown-cmark = "0.13"
|
||||
regex = "1.11"
|
||||
rusqlite = { version = "0.37", features = ["bundled"] }
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
sha2 = "0.10"
|
||||
toml = "0.9"
|
||||
tracing = "0.1"
|
||||
tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
|
||||
|
|
|
|||
|
|
@ -232,7 +232,7 @@ Store:
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-011 — Extract tags
|
||||
## [x] GM-011 — Extract tags
|
||||
|
||||
### Goals
|
||||
Parse tags from notes.
|
||||
|
|
@ -254,7 +254,7 @@ Normalize:
|
|||
|
||||
# Phase 3 — Database Layer
|
||||
|
||||
## [ ] GM-012 — Add SQLite integration
|
||||
## [x] GM-012 — Add SQLite integration
|
||||
|
||||
### Goals
|
||||
Create local metadata database.
|
||||
|
|
@ -271,7 +271,7 @@ Create local metadata database.
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-013 — Create notes table
|
||||
## [x] GM-013 — Create notes table
|
||||
|
||||
### Goals
|
||||
Store note metadata.
|
||||
|
|
@ -289,7 +289,7 @@ Create schema for:
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-014 — Create chunks table
|
||||
## [x] GM-014 — Create chunks table
|
||||
|
||||
### Goals
|
||||
Store retrieval chunks.
|
||||
|
|
@ -308,7 +308,7 @@ Store:
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-015 — Add content hashing
|
||||
## [x] GM-015 — Add content hashing
|
||||
|
||||
### Goals
|
||||
Detect changed notes efficiently.
|
||||
|
|
|
|||
|
|
@ -1,6 +1,9 @@
|
|||
[vault]
|
||||
path = "."
|
||||
|
||||
[database]
|
||||
path = ".agent/cache/glassmind.sqlite3"
|
||||
|
||||
[index]
|
||||
include_agent_dir = true
|
||||
ignore_dirs = [
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize};
|
|||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct Config {
|
||||
pub vault: VaultConfig,
|
||||
pub database: DatabaseConfig,
|
||||
pub index: IndexConfig,
|
||||
pub embeddings: EmbeddingsConfig,
|
||||
pub search: SearchConfig,
|
||||
|
|
@ -19,6 +20,11 @@ pub struct VaultConfig {
|
|||
pub path: PathBuf,
|
||||
}
|
||||
|
||||
/// Settings for the SQLite metadata database (the `[database]` config table).
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct DatabaseConfig {
|
||||
/// Location of the SQLite file. The `index` command joins this onto
/// `vault.path`, and config validation rejects an empty value.
/// The built-in default is `.agent/cache/glassmind.sqlite3`.
pub path: PathBuf,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||
pub struct IndexConfig {
|
||||
pub include_agent_dir: bool,
|
||||
|
|
@ -93,6 +99,9 @@ impl Config {
|
|||
if self.server.port == 0 {
|
||||
bail!("server.port must be greater than zero");
|
||||
}
|
||||
if self.database.path.as_os_str().is_empty() {
|
||||
bail!("database.path must not be empty");
|
||||
}
|
||||
match self.writes.mode.as_str() {
|
||||
"off" | "agent-only" | "propose" | "allow" => {}
|
||||
other => {
|
||||
|
|
@ -138,6 +147,9 @@ impl Default for Config {
|
|||
vault: VaultConfig {
|
||||
path: PathBuf::from("."),
|
||||
},
|
||||
database: DatabaseConfig {
|
||||
path: PathBuf::from(".agent/cache/glassmind.sqlite3"),
|
||||
},
|
||||
index: IndexConfig {
|
||||
include_agent_dir: true,
|
||||
ignore_dirs: vec![
|
||||
|
|
|
|||
277
src/db.rs
Normal file
277
src/db.rs
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use rusqlite::{Connection, OptionalExtension, params};
|
||||
use sha2::{Digest, Sha256};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::markdown::MarkdownBlockKind;
|
||||
use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
|
||||
|
||||
pub struct IndexStore {
|
||||
conn: Connection,
|
||||
}
|
||||
|
||||
impl IndexStore {
|
||||
pub fn open(path: &Path) -> Result<Self> {
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)
|
||||
.with_context(|| format!("failed to create db dir {}", parent.display()))?;
|
||||
}
|
||||
|
||||
let conn = Connection::open(path)
|
||||
.with_context(|| format!("failed to open sqlite db {}", path.display()))?;
|
||||
let store = Self { conn };
|
||||
store.bootstrap()?;
|
||||
Ok(store)
|
||||
}
|
||||
|
||||
pub fn write_index(&mut self, index: &VaultIndex) -> Result<IndexWriteSummary> {
|
||||
let tx = self.conn.transaction()?;
|
||||
let mut summary = IndexWriteSummary::default();
|
||||
|
||||
// This is a rebuildable cache, so changed notes get their child rows replaced in place.
|
||||
for note in &index.notes {
|
||||
summary.notes_seen += 1;
|
||||
let existing_hash = existing_note_hash(&tx, ¬e.path)?;
|
||||
if existing_hash.as_deref() == Some(note.content_hash.as_str()) {
|
||||
summary.unchanged_notes += 1;
|
||||
debug!(path = %note.path.display(), "skipping unchanged note");
|
||||
continue;
|
||||
}
|
||||
|
||||
summary.changed_notes += 1;
|
||||
let note_id = upsert_note(&tx, note)?;
|
||||
clear_note_children(&tx, note_id)?;
|
||||
insert_chunks(&tx, note_id, note, &mut summary)?;
|
||||
insert_tags(&tx, note_id, note, &mut summary)?;
|
||||
insert_links(&tx, note_id, note, &mut summary)?;
|
||||
}
|
||||
|
||||
tx.commit()?;
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
fn bootstrap(&self) -> Result<()> {
|
||||
self.conn.execute_batch(
|
||||
r#"
|
||||
PRAGMA foreign_keys = ON;
|
||||
|
||||
CREATE TABLE IF NOT EXISTS migrations (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL UNIQUE,
|
||||
applied_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS notes (
|
||||
id INTEGER PRIMARY KEY,
|
||||
path TEXT NOT NULL UNIQUE,
|
||||
filename TEXT NOT NULL,
|
||||
title TEXT NOT NULL,
|
||||
modified_unix_secs INTEGER,
|
||||
file_size INTEGER NOT NULL,
|
||||
content_hash TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS chunks (
|
||||
id INTEGER PRIMARY KEY,
|
||||
note_id INTEGER NOT NULL,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
heading_path TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
chunk_type TEXT NOT NULL,
|
||||
start_line INTEGER NOT NULL,
|
||||
end_line INTEGER NOT NULL,
|
||||
token_estimate INTEGER NOT NULL,
|
||||
content_hash TEXT NOT NULL,
|
||||
FOREIGN KEY(note_id) REFERENCES notes(id) ON DELETE CASCADE,
|
||||
UNIQUE(note_id, chunk_index)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tags (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL UNIQUE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS note_tags (
|
||||
note_id INTEGER NOT NULL,
|
||||
tag_id INTEGER NOT NULL,
|
||||
FOREIGN KEY(note_id) REFERENCES notes(id) ON DELETE CASCADE,
|
||||
FOREIGN KEY(tag_id) REFERENCES tags(id) ON DELETE CASCADE,
|
||||
PRIMARY KEY(note_id, tag_id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS links (
|
||||
id INTEGER PRIMARY KEY,
|
||||
source_note_id INTEGER NOT NULL,
|
||||
target TEXT NOT NULL,
|
||||
alias TEXT,
|
||||
link_type TEXT NOT NULL DEFAULT 'wikilink',
|
||||
FOREIGN KEY(source_note_id) REFERENCES notes(id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
|
||||
"#,
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> {
|
||||
conn.query_row(
|
||||
"SELECT content_hash FROM notes WHERE path = ?1",
|
||||
[path_to_db(path)],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.optional()
|
||||
.context("failed to read existing note hash")
|
||||
}
|
||||
|
||||
fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
||||
conn.execute(
|
||||
r#"
|
||||
INSERT INTO notes (
|
||||
path,
|
||||
filename,
|
||||
title,
|
||||
modified_unix_secs,
|
||||
file_size,
|
||||
content_hash,
|
||||
updated_at
|
||||
)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT(path) DO UPDATE SET
|
||||
filename = excluded.filename,
|
||||
title = excluded.title,
|
||||
modified_unix_secs = excluded.modified_unix_secs,
|
||||
file_size = excluded.file_size,
|
||||
content_hash = excluded.content_hash,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
"#,
|
||||
params![
|
||||
path_to_db(¬e.path),
|
||||
note.filename,
|
||||
note.title,
|
||||
note.modified_unix_secs,
|
||||
note.file_size,
|
||||
note.content_hash,
|
||||
],
|
||||
)?;
|
||||
|
||||
conn.query_row(
|
||||
"SELECT id FROM notes WHERE path = ?1",
|
||||
[path_to_db(¬e.path)],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.context("failed to read upserted note id")
|
||||
}
|
||||
|
||||
/// Removes every chunk, tag association and outgoing link that belongs to
/// `note_id`, leaving the note row itself in place.
fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
    // Executed in this order: chunks, tag associations, links.
    const CHILD_DELETES: [&str; 3] = [
        "DELETE FROM chunks WHERE note_id = ?1",
        "DELETE FROM note_tags WHERE note_id = ?1",
        "DELETE FROM links WHERE source_note_id = ?1",
    ];

    for sql in CHILD_DELETES {
        conn.execute(sql, [note_id])?;
    }
    Ok(())
}
|
||||
|
||||
fn insert_chunks(
|
||||
conn: &Connection,
|
||||
note_id: i64,
|
||||
note: &NoteMetadata,
|
||||
summary: &mut IndexWriteSummary,
|
||||
) -> Result<()> {
|
||||
for (idx, block) in note.blocks.iter().enumerate() {
|
||||
if block.text.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
conn.execute(
|
||||
r#"
|
||||
INSERT INTO chunks (
|
||||
note_id,
|
||||
chunk_index,
|
||||
heading_path,
|
||||
content,
|
||||
chunk_type,
|
||||
start_line,
|
||||
end_line,
|
||||
token_estimate,
|
||||
content_hash
|
||||
)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
|
||||
"#,
|
||||
params![
|
||||
note_id,
|
||||
idx as i64,
|
||||
block.heading_path.join(" > "),
|
||||
block.text,
|
||||
chunk_type(&block.kind),
|
||||
block.start_line as i64,
|
||||
block.end_line as i64,
|
||||
estimate_tokens(&block.text) as i64,
|
||||
sha256_hex(&block.text),
|
||||
],
|
||||
)?;
|
||||
summary.chunks_written += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_tags(
|
||||
conn: &Connection,
|
||||
note_id: i64,
|
||||
note: &NoteMetadata,
|
||||
summary: &mut IndexWriteSummary,
|
||||
) -> Result<()> {
|
||||
for tag in ¬e.tags {
|
||||
conn.execute("INSERT OR IGNORE INTO tags (name) VALUES (?1)", [tag])?;
|
||||
let tag_id: i64 = conn.query_row("SELECT id FROM tags WHERE name = ?1", [tag], |row| {
|
||||
row.get(0)
|
||||
})?;
|
||||
conn.execute(
|
||||
"INSERT OR IGNORE INTO note_tags (note_id, tag_id) VALUES (?1, ?2)",
|
||||
params![note_id, tag_id],
|
||||
)?;
|
||||
summary.tags_seen += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_links(
|
||||
conn: &Connection,
|
||||
note_id: i64,
|
||||
note: &NoteMetadata,
|
||||
summary: &mut IndexWriteSummary,
|
||||
) -> Result<()> {
|
||||
for link in ¬e.wikilinks {
|
||||
conn.execute(
|
||||
"INSERT INTO links (source_note_id, target, alias, link_type) VALUES (?1, ?2, ?3, 'wikilink')",
|
||||
params![note_id, link.target, link.alias],
|
||||
)?;
|
||||
summary.links_written += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn sha256_hex(content: &str) -> String {
|
||||
format!("{:x}", Sha256::digest(content.as_bytes()))
|
||||
}
|
||||
|
||||
/// Rough token estimate: the number of whitespace-separated words in
/// `content`, never less than one.
fn estimate_tokens(content: &str) -> usize {
    let words = content.split_whitespace().count();
    if words == 0 { 1 } else { words }
}
|
||||
|
||||
fn chunk_type(kind: &MarkdownBlockKind) -> &'static str {
|
||||
match kind {
|
||||
MarkdownBlockKind::Heading => "heading",
|
||||
MarkdownBlockKind::Paragraph => "paragraph",
|
||||
MarkdownBlockKind::CodeBlock => "code_block",
|
||||
MarkdownBlockKind::List => "list",
|
||||
}
|
||||
}
|
||||
|
||||
/// Renders `path` as the string key used in the database: lossy UTF-8 with
/// every backslash replaced by a forward slash.
fn path_to_db(path: &Path) -> String {
    let owned: PathBuf = path.to_path_buf();
    let text = owned.to_string_lossy();
    text.replace('\\', "/")
}
|
||||
12
src/main.rs
12
src/main.rs
|
|
@ -1,5 +1,6 @@
|
|||
mod cli;
|
||||
mod config;
|
||||
mod db;
|
||||
mod logging;
|
||||
mod markdown;
|
||||
mod vault;
|
||||
|
|
@ -10,6 +11,7 @@ use tracing::{debug, info};
|
|||
|
||||
use crate::cli::{Cli, Commands, OutputFormat};
|
||||
use crate::config::Config;
|
||||
use crate::db::IndexStore;
|
||||
use crate::vault::VaultIndex;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
|
|
@ -25,10 +27,16 @@ fn main() -> Result<()> {
|
|||
Commands::Init { force } => init_project(&config, force),
|
||||
Commands::Index { json } => {
|
||||
let index = VaultIndex::scan(&config)?;
|
||||
config.create_agent_dirs()?;
|
||||
// Indexing writes the rebuildable cache, while search can still scan live markdown.
|
||||
let db_path = config.vault.path.join(&config.database.path);
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
let writes = store.write_index(&index)?;
|
||||
let summary = index.summary_with_writes(writes);
|
||||
if json {
|
||||
println!("{}", serde_json::to_string_pretty(&index.summary())?);
|
||||
println!("{}", serde_json::to_string_pretty(&summary)?);
|
||||
} else {
|
||||
println!("{}", index.summary());
|
||||
println!("{summary}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
125
src/markdown.rs
125
src/markdown.rs
|
|
@ -1,3 +1,5 @@
|
|||
use std::collections::BTreeSet;
|
||||
|
||||
use regex::Regex;
|
||||
use serde::Serialize;
|
||||
|
||||
|
|
@ -6,6 +8,7 @@ pub struct MarkdownDocument {
|
|||
pub headings: Vec<String>,
|
||||
pub blocks: Vec<MarkdownBlock>,
|
||||
pub wikilinks: Vec<Wikilink>,
|
||||
pub tags: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -14,6 +17,7 @@ pub struct MarkdownBlock {
|
|||
pub text: String,
|
||||
pub start_line: usize,
|
||||
pub end_line: usize,
|
||||
pub heading_path: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -42,11 +46,13 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
let mut in_code = false;
|
||||
let mut code = Vec::new();
|
||||
let mut code_start = 0;
|
||||
let mut heading_stack: Vec<(usize, String)> = Vec::new();
|
||||
|
||||
for (idx, line) in content.lines().enumerate() {
|
||||
let line_no = idx + 1;
|
||||
let trimmed = line.trim();
|
||||
|
||||
// Code fences get kept whole so later chunks stay readable.
|
||||
if trimmed.starts_with("```") || trimmed.starts_with("~~~") {
|
||||
if in_code {
|
||||
code.push(line.to_string());
|
||||
|
|
@ -55,6 +61,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
text: code.join("\n"),
|
||||
start_line: code_start,
|
||||
end_line: line_no,
|
||||
heading_path: current_heading_path(&heading_stack),
|
||||
});
|
||||
code.clear();
|
||||
in_code = false;
|
||||
|
|
@ -64,6 +71,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
&mut paragraph,
|
||||
paragraph_start,
|
||||
line_no.saturating_sub(1),
|
||||
&heading_stack,
|
||||
);
|
||||
in_code = true;
|
||||
code_start = line_no;
|
||||
|
|
@ -77,19 +85,28 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
continue;
|
||||
}
|
||||
|
||||
if let Some(heading) = parse_heading(trimmed) {
|
||||
if let Some((level, heading)) = parse_heading(trimmed) {
|
||||
flush_paragraph(
|
||||
&mut blocks,
|
||||
&mut paragraph,
|
||||
paragraph_start,
|
||||
line_no.saturating_sub(1),
|
||||
&heading_stack,
|
||||
);
|
||||
while heading_stack
|
||||
.last()
|
||||
.is_some_and(|(last_level, _)| *last_level >= level)
|
||||
{
|
||||
heading_stack.pop();
|
||||
}
|
||||
heading_stack.push((level, heading.clone()));
|
||||
headings.push(heading.clone());
|
||||
blocks.push(MarkdownBlock {
|
||||
kind: MarkdownBlockKind::Heading,
|
||||
text: heading,
|
||||
start_line: line_no,
|
||||
end_line: line_no,
|
||||
heading_path: current_heading_path(&heading_stack),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
|
@ -100,12 +117,14 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
&mut paragraph,
|
||||
paragraph_start,
|
||||
line_no.saturating_sub(1),
|
||||
&heading_stack,
|
||||
);
|
||||
blocks.push(MarkdownBlock {
|
||||
kind: MarkdownBlockKind::List,
|
||||
text: trimmed.to_string(),
|
||||
start_line: line_no,
|
||||
end_line: line_no,
|
||||
heading_path: current_heading_path(&heading_stack),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
|
@ -116,6 +135,7 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
&mut paragraph,
|
||||
paragraph_start,
|
||||
line_no.saturating_sub(1),
|
||||
&heading_stack,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
|
@ -133,14 +153,22 @@ pub fn parse_markdown(source_path: &str, content: &str) -> MarkdownDocument {
|
|||
text: code.join("\n"),
|
||||
start_line: code_start,
|
||||
end_line: final_line,
|
||||
heading_path: current_heading_path(&heading_stack),
|
||||
});
|
||||
}
|
||||
flush_paragraph(&mut blocks, &mut paragraph, paragraph_start, final_line);
|
||||
flush_paragraph(
|
||||
&mut blocks,
|
||||
&mut paragraph,
|
||||
paragraph_start,
|
||||
final_line,
|
||||
&heading_stack,
|
||||
);
|
||||
|
||||
MarkdownDocument {
|
||||
headings,
|
||||
blocks,
|
||||
wikilinks: extract_wikilinks(source_path, content),
|
||||
tags: extract_tags(content),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -149,6 +177,7 @@ fn flush_paragraph(
|
|||
paragraph: &mut Vec<String>,
|
||||
start_line: usize,
|
||||
end_line: usize,
|
||||
heading_stack: &[(usize, String)],
|
||||
) {
|
||||
if paragraph.is_empty() {
|
||||
return;
|
||||
|
|
@ -159,19 +188,27 @@ fn flush_paragraph(
|
|||
text: paragraph.join(" "),
|
||||
start_line,
|
||||
end_line,
|
||||
heading_path: current_heading_path(heading_stack),
|
||||
});
|
||||
paragraph.clear();
|
||||
}
|
||||
|
||||
fn parse_heading(trimmed: &str) -> Option<String> {
|
||||
fn parse_heading(trimmed: &str) -> Option<(usize, String)> {
|
||||
let hashes = trimmed.chars().take_while(|c| *c == '#').count();
|
||||
if (1..=6).contains(&hashes) && trimmed.chars().nth(hashes) == Some(' ') {
|
||||
Some(trimmed[hashes + 1..].trim().to_string())
|
||||
Some((hashes, trimmed[hashes + 1..].trim().to_string()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the heading titles from `heading_stack` in stack order, dropping
/// the level stored next to each title.
fn current_heading_path(heading_stack: &[(usize, String)]) -> Vec<String> {
    let mut path = Vec::with_capacity(heading_stack.len());
    for (_, title) in heading_stack {
        path.push(title.clone());
    }
    path
}
|
||||
|
||||
fn is_list_item(trimmed: &str) -> bool {
|
||||
trimmed.starts_with("- ")
|
||||
|| trimmed.starts_with("* ")
|
||||
|
|
@ -203,9 +240,78 @@ pub fn extract_wikilinks(source_path: &str, content: &str) -> Vec<Wikilink> {
|
|||
.collect()
|
||||
}
|
||||
|
||||
pub fn extract_tags(content: &str) -> Vec<String> {
|
||||
let mut tags = BTreeSet::new();
|
||||
// Frontmatter and inline tags meet here, then we normalize once.
|
||||
for tag in extract_frontmatter_tags(content)
|
||||
.into_iter()
|
||||
.chain(extract_inline_tags(content))
|
||||
{
|
||||
let normalized = normalize_tag(&tag);
|
||||
if !normalized.is_empty() {
|
||||
tags.insert(normalized);
|
||||
}
|
||||
}
|
||||
tags.into_iter().collect()
|
||||
}
|
||||
|
||||
fn extract_frontmatter_tags(content: &str) -> Vec<String> {
|
||||
let mut tags = Vec::new();
|
||||
let mut lines = content.lines();
|
||||
if lines.next() != Some("---") {
|
||||
return tags;
|
||||
}
|
||||
|
||||
let mut in_tags_list = false;
|
||||
for line in lines {
|
||||
let trimmed = line.trim();
|
||||
if trimmed == "---" {
|
||||
break;
|
||||
}
|
||||
|
||||
if let Some(value) = trimmed.strip_prefix("tags:") {
|
||||
in_tags_list = true;
|
||||
tags.extend(split_tag_values(value));
|
||||
continue;
|
||||
}
|
||||
|
||||
if in_tags_list && trimmed.starts_with('-') {
|
||||
tags.push(trimmed.trim_start_matches('-').trim().to_string());
|
||||
continue;
|
||||
}
|
||||
|
||||
if !trimmed.is_empty() && !trimmed.starts_with('#') {
|
||||
in_tags_list = false;
|
||||
}
|
||||
}
|
||||
|
||||
tags
|
||||
}
|
||||
|
||||
fn extract_inline_tags(content: &str) -> Vec<String> {
|
||||
let tag_re = Regex::new(r"(?m)(^|[\s(\[{])#([A-Za-z0-9_/-]+)").expect("valid tag regex");
|
||||
tag_re
|
||||
.captures_iter(content)
|
||||
.filter_map(|capture| capture.get(2).map(|tag| tag.as_str().to_string()))
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Splits the value of an inline `tags:` entry into individual tags.
///
/// Surrounding `[` / `]` are removed, items are separated on commas, and
/// whitespace plus surrounding quotes are stripped; empty items are dropped.
fn split_tag_values(value: &str) -> Vec<String> {
    let inner = value.trim().trim_start_matches('[').trim_end_matches(']');

    let mut tags = Vec::new();
    for piece in inner.split(',') {
        let cleaned = piece.trim().trim_matches('"').trim_matches('\'');
        if !cleaned.is_empty() {
            tags.push(cleaned.to_string());
        }
    }
    tags
}
|
||||
|
||||
/// Canonical form of a tag: surrounding whitespace and leading `#` characters
/// removed, then lowercased.
fn normalize_tag(tag: &str) -> String {
    let without_hash = tag.trim().trim_start_matches('#');
    without_hash.trim().to_lowercase()
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{MarkdownBlockKind, extract_wikilinks, parse_markdown};
|
||||
use super::{MarkdownBlockKind, extract_tags, extract_wikilinks, parse_markdown};
|
||||
|
||||
#[test]
|
||||
fn extracts_obsidian_wikilink_forms() {
|
||||
|
|
@ -251,4 +357,13 @@ mod tests {
|
|||
.any(|block| matches!(block.kind, MarkdownBlockKind::CodeBlock))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn extracts_and_normalizes_tags() {
    // Frontmatter and inline tags overlap on "Rust"; the markdown heading
    // must not be picked up as a tag.
    let note =
        "---\ntags: [Rust, glassmind]\n---\nBody #Rust #local-first\n# Heading is not a tag\n";

    let found = extract_tags(note);

    assert_eq!(found, vec!["glassmind", "local-first", "rust"]);
}
|
||||
}
|
||||
|
|
|
|||
36
src/vault.rs
36
src/vault.rs
|
|
@ -9,6 +9,7 @@ use tracing::{debug, warn};
|
|||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::db::sha256_hex;
|
||||
use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -26,9 +27,11 @@ pub struct NoteMetadata {
|
|||
pub title: String,
|
||||
pub modified_unix_secs: Option<u64>,
|
||||
pub file_size: u64,
|
||||
pub content_hash: String,
|
||||
pub headings: Vec<String>,
|
||||
pub blocks: Vec<MarkdownBlock>,
|
||||
pub wikilinks: Vec<Wikilink>,
|
||||
pub tags: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -39,7 +42,19 @@ pub struct IndexSummary {
|
|||
pub headings: usize,
|
||||
pub blocks: usize,
|
||||
pub wikilinks: usize,
|
||||
pub tags: usize,
|
||||
pub skipped_dirs: Vec<PathBuf>,
|
||||
pub writes: Option<IndexWriteSummary>,
|
||||
}
|
||||
|
||||
/// Counters describing what a database write of the index touched.
#[derive(Clone, Debug, Default, Serialize)]
|
||||
pub struct IndexWriteSummary {
|
||||
/// Notes visited, whether or not they needed writing.
pub notes_seen: usize,
|
||||
/// Notes whose stored content hash was missing or different, and were rewritten.
pub changed_notes: usize,
|
||||
/// Notes skipped because their stored content hash matched.
pub unchanged_notes: usize,
|
||||
/// Chunk rows inserted for changed notes.
pub chunks_written: usize,
|
||||
/// Tags processed on changed notes (counted once per note/tag pair).
pub tags_seen: usize,
|
||||
/// Link rows inserted for changed notes.
pub links_written: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
|
|
@ -115,7 +130,16 @@ impl VaultIndex {
|
|||
headings: self.notes.iter().map(|note| note.headings.len()).sum(),
|
||||
blocks: self.notes.iter().map(|note| note.blocks.len()).sum(),
|
||||
wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(),
|
||||
tags: self.notes.iter().map(|note| note.tags.len()).sum(),
|
||||
skipped_dirs: self.skipped_dirs.clone(),
|
||||
writes: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Builds the regular index summary and attaches the database write
/// counters to it.
pub fn summary_with_writes(&self, writes: IndexWriteSummary) -> IndexSummary {
    let mut summary = self.summary();
    summary.writes = Some(writes);
    summary
}
|
||||
|
||||
|
|
@ -213,7 +237,14 @@ impl fmt::Display for IndexSummary {
|
|||
writeln!(f, "Headings parsed: {}", self.headings)?;
|
||||
writeln!(f, "Markdown blocks: {}", self.blocks)?;
|
||||
writeln!(f, "Wikilinks: {}", self.wikilinks)?;
|
||||
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())
|
||||
writeln!(f, "Tags: {}", self.tags)?;
|
||||
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?;
|
||||
if let Some(writes) = &self.writes {
|
||||
writeln!(f, "Changed notes: {}", writes.changed_notes)?;
|
||||
writeln!(f, "Unchanged notes skipped: {}", writes.unchanged_notes)?;
|
||||
writeln!(f, "Chunks written: {}", writes.chunks_written)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -225,6 +256,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
|
|||
let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf();
|
||||
let source_path = relative_path.to_string_lossy().replace('\\', "/");
|
||||
let parsed = parse_markdown(&source_path, &content);
|
||||
let content_hash = sha256_hex(&content);
|
||||
|
||||
Ok(NoteMetadata {
|
||||
path: relative_path,
|
||||
|
|
@ -240,9 +272,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
|
|||
.and_then(|modified| modified.duration_since(UNIX_EPOCH).ok())
|
||||
.map(|duration| duration.as_secs()),
|
||||
file_size: metadata.len(),
|
||||
content_hash,
|
||||
headings: parsed.headings,
|
||||
blocks: parsed.blocks,
|
||||
wikilinks: parsed.wikilinks,
|
||||
tags: parsed.tags,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user