mirror of
https://github.com/khodges42/glassMind.git
synced 2026-06-14 18:18:36 +00:00
Add heading chunks and SQLite FTS search
This commit is contained in:
parent
9fb82b5324
commit
15854cc91e
|
|
@ -327,7 +327,7 @@ Detect changed notes efficiently.
|
|||
|
||||
# Phase 4 — Chunking
|
||||
|
||||
## [ ] GM-016 — Implement heading-based chunking
|
||||
## [x] GM-016 — Implement heading-based chunking
|
||||
|
||||
### Goals
|
||||
Split notes into useful retrieval units.
|
||||
|
|
@ -344,7 +344,7 @@ Split notes into useful retrieval units.
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-017 — Add fallback chunk splitting
|
||||
## [x] GM-017 — Add fallback chunk splitting
|
||||
|
||||
### Goals
|
||||
Handle giant sections safely.
|
||||
|
|
@ -360,7 +360,7 @@ Handle giant sections safely.
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-018 — Estimate token counts
|
||||
## [x] GM-018 — Estimate token counts
|
||||
|
||||
### Goals
|
||||
Prepare for LLM context budgeting.
|
||||
|
|
@ -378,7 +378,7 @@ Prepare for LLM context budgeting.
|
|||
|
||||
# Phase 5 — Search
|
||||
|
||||
## [ ] GM-019 — Implement SQLite FTS search
|
||||
## [x] GM-019 — Implement SQLite FTS search
|
||||
|
||||
### Goals
|
||||
Add keyword search.
|
||||
|
|
@ -396,7 +396,7 @@ Add keyword search.
|
|||
|
||||
---
|
||||
|
||||
## [ ] GM-020 — Implement basic CLI search command
|
||||
## [x] GM-020 — Implement basic CLI search command
|
||||
|
||||
### Goals
|
||||
Expose usable search interface.
|
||||
|
|
|
|||
210
src/chunk.rs
Normal file
210
src/chunk.rs
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
use serde::Serialize;
|
||||
|
||||
use crate::db::sha256_hex;
|
||||
use crate::markdown::{MarkdownBlock, MarkdownBlockKind};
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct NoteChunk {
|
||||
pub index: usize,
|
||||
pub heading_path: Vec<String>,
|
||||
pub content: String,
|
||||
pub chunk_type: ChunkType,
|
||||
pub start_line: usize,
|
||||
pub end_line: usize,
|
||||
pub token_estimate: usize,
|
||||
pub content_hash: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ChunkType {
|
||||
HeadingSection,
|
||||
SplitSection,
|
||||
}
|
||||
|
||||
pub fn build_chunks(
|
||||
blocks: &[MarkdownBlock],
|
||||
target_tokens: usize,
|
||||
overlap_tokens: usize,
|
||||
) -> Vec<NoteChunk> {
|
||||
let mut chunks = Vec::new();
|
||||
let mut current: Vec<MarkdownBlock> = Vec::new();
|
||||
|
||||
for block in blocks {
|
||||
if matches!(block.kind, MarkdownBlockKind::Heading) && !current.is_empty() {
|
||||
push_section_chunks(&mut chunks, ¤t, target_tokens, overlap_tokens);
|
||||
current.clear();
|
||||
}
|
||||
current.push(block.clone());
|
||||
}
|
||||
|
||||
if !current.is_empty() {
|
||||
push_section_chunks(&mut chunks, ¤t, target_tokens, overlap_tokens);
|
||||
}
|
||||
|
||||
for (index, chunk) in chunks.iter_mut().enumerate() {
|
||||
chunk.index = index;
|
||||
}
|
||||
|
||||
chunks
|
||||
}
|
||||
|
||||
fn push_section_chunks(
|
||||
chunks: &mut Vec<NoteChunk>,
|
||||
section: &[MarkdownBlock],
|
||||
target_tokens: usize,
|
||||
overlap_tokens: usize,
|
||||
) {
|
||||
let text = section_text(section);
|
||||
if text.trim().is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let token_estimate = estimate_tokens(&text);
|
||||
let heading_path = section
|
||||
.iter()
|
||||
.rev()
|
||||
.find(|block| !block.heading_path.is_empty())
|
||||
.map(|block| block.heading_path.clone())
|
||||
.unwrap_or_default();
|
||||
let start_line = section.first().map_or(1, |block| block.start_line);
|
||||
let end_line = section.last().map_or(start_line, |block| block.end_line);
|
||||
|
||||
if token_estimate <= target_tokens {
|
||||
chunks.push(NoteChunk {
|
||||
index: 0,
|
||||
heading_path,
|
||||
content_hash: sha256_hex(&text),
|
||||
content: text,
|
||||
chunk_type: ChunkType::HeadingSection,
|
||||
start_line,
|
||||
end_line,
|
||||
token_estimate,
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
// Big sections get split by rough words first. Good enough for now, easy to inspect later.
|
||||
for part in split_with_overlap(&text, target_tokens, overlap_tokens) {
|
||||
let token_estimate = estimate_tokens(&part);
|
||||
chunks.push(NoteChunk {
|
||||
index: 0,
|
||||
heading_path: heading_path.clone(),
|
||||
content_hash: sha256_hex(&part),
|
||||
content: part,
|
||||
chunk_type: ChunkType::SplitSection,
|
||||
start_line,
|
||||
end_line,
|
||||
token_estimate,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
fn section_text(section: &[MarkdownBlock]) -> String {
|
||||
section
|
||||
.iter()
|
||||
.map(|block| block.text.trim())
|
||||
.filter(|text| !text.is_empty())
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n\n")
|
||||
}
|
||||
|
||||
/// Rough token estimate: the number of whitespace-separated words, never less
/// than 1 (blank input counts as a single token).
pub fn estimate_tokens(content: &str) -> usize {
    match content.split_whitespace().count() {
        0 => 1,
        words => words,
    }
}
|
||||
|
||||
/// Splits `content` into windows of at most `target_tokens` whitespace-separated
/// words, repeating up to `overlap_tokens` words between consecutive windows.
///
/// Each window is passed through `trim_to_sentenceish_boundary`. The next window
/// starts relative to the words that were actually kept, so text removed by the
/// trim is carried into the following part instead of being dropped. Every word
/// of `content` therefore appears in at least one returned part.
///
/// * `target_tokens` - maximum words per window; 0 is treated as 1.
/// * `overlap_tokens` - words shared with the previous part; capped so that each
///   window still advances by at least one word.
///
/// Returns an empty vector when `content` contains no words.
fn split_with_overlap(content: &str, target_tokens: usize, overlap_tokens: usize) -> Vec<String> {
    let words: Vec<&str> = content.split_whitespace().collect();
    if words.is_empty() {
        return Vec::new();
    }

    // A zero-sized window would only ever produce empty parts.
    let window_size = target_tokens.max(1);
    let mut parts = Vec::new();
    let mut start = 0;

    while start < words.len() {
        let end = start.saturating_add(window_size).min(words.len());
        let window = &words[start..end];

        let joined = window.join(" ");
        let full_len = joined.len();
        let trimmed = trim_to_sentenceish_boundary(joined);

        let (part, kept) = if trimmed.len() == full_len {
            (trimmed, window.len())
        } else {
            match whole_words_in_prefix(window, trimmed.len()) {
                // The cut landed inside the first word; keep the window whole so
                // the loop is guaranteed to make progress.
                0 => (window.join(" "), window.len()),
                kept => (trimmed, kept),
            }
        };
        parts.push(part);

        let consumed_end = start + kept;
        if consumed_end >= words.len() {
            break;
        }
        // `kept >= 1`, so capping the overlap at `kept - 1` advances `start` by
        // at least one word per iteration.
        start = consumed_end - overlap_tokens.min(kept - 1);
    }

    parts
}

/// Counts how many leading words of `window`, joined by single spaces, fit
/// entirely within the first `prefix_len` bytes of the joined text.
fn whole_words_in_prefix(window: &[&str], prefix_len: usize) -> usize {
    let mut offset = 0;
    let mut kept = 0;
    for (position, word) in window.iter().enumerate() {
        if position > 0 {
            offset += 1; // the joining space
        }
        offset += word.len();
        if offset > prefix_len {
            break;
        }
        kept += 1;
    }
    kept
}

/// Cuts `part` back to its last `.`, `!` or `?` when that looks worthwhile.
///
/// Parts shorter than 240 bytes, or already ending in one of those characters,
/// are returned unchanged. Otherwise the part is truncated just after the last
/// terminator, but only if that terminator lies in the second half of the part.
fn trim_to_sentenceish_boundary(mut part: String) -> String {
    const TERMINATORS: [char; 3] = ['.', '!', '?'];

    if part.len() < 240 || part.ends_with(TERMINATORS) {
        return part;
    }

    if let Some(idx) = part.rfind(TERMINATORS) {
        if idx > part.len() / 2 {
            // Terminators are single-byte ASCII, so `idx + 1` is a char boundary.
            part.truncate(idx + 1);
        }
    }
    part
}
|
||||
|
||||
pub fn chunk_type_name(kind: &ChunkType) -> &'static str {
|
||||
match kind {
|
||||
ChunkType::HeadingSection => "heading_section",
|
||||
ChunkType::SplitSection => "split_section",
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::{ChunkType, build_chunks};
    use crate::markdown::{MarkdownBlock, MarkdownBlockKind};

    /// Builds a single-line block fixture.
    fn block(
        kind: MarkdownBlockKind,
        text: &str,
        line: usize,
        heading_path: Vec<&str>,
    ) -> MarkdownBlock {
        let heading_path = heading_path.into_iter().map(String::from).collect();
        MarkdownBlock {
            kind,
            text: text.to_string(),
            start_line: line,
            end_line: line,
            heading_path,
        }
    }

    #[test]
    fn builds_heading_chunks_in_order() {
        // Two headings, each followed by one paragraph, give two sections.
        let blocks = vec![
            block(MarkdownBlockKind::Heading, "A", 1, vec!["A"]),
            block(MarkdownBlockKind::Paragraph, "one", 2, vec!["A"]),
            block(MarkdownBlockKind::Heading, "B", 3, vec!["B"]),
            block(MarkdownBlockKind::Paragraph, "two", 4, vec!["B"]),
        ];

        let chunks = build_chunks(&blocks, 100, 10);

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].heading_path, vec!["A"]);
        assert_eq!(chunks[1].heading_path, vec!["B"]);
        assert!(matches!(chunks[0].chunk_type, ChunkType::HeadingSection));
    }

    #[test]
    fn splits_large_sections_with_overlap() {
        // Thirty words against a ten-word target forces the fallback splitter.
        let words: Vec<String> = (0..30).map(|idx| format!("word{idx}")).collect();
        let text = words.join(" ");
        let blocks = vec![block(MarkdownBlockKind::Paragraph, &text, 1, vec![])];

        let chunks = build_chunks(&blocks, 10, 2);

        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|chunk| chunk.token_estimate <= 10));
        assert!(matches!(chunks[1].chunk_type, ChunkType::SplitSection));
    }
}
|
||||
206
src/db.rs
206
src/db.rs
|
|
@ -6,13 +6,25 @@ use rusqlite::{Connection, OptionalExtension, params};
|
|||
use sha2::{Digest, Sha256};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::markdown::MarkdownBlockKind;
|
||||
use crate::chunk::chunk_type_name;
|
||||
use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
|
||||
|
||||
const INDEX_VERSION: i64 = 2;
|
||||
|
||||
pub struct IndexStore {
|
||||
conn: Connection,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, serde::Serialize)]
|
||||
pub struct SearchHit {
|
||||
pub path: String,
|
||||
pub title: String,
|
||||
pub heading_path: String,
|
||||
pub snippet: String,
|
||||
pub score: f64,
|
||||
pub token_estimate: usize,
|
||||
}
|
||||
|
||||
impl IndexStore {
|
||||
pub fn open(path: &Path) -> Result<Self> {
|
||||
if let Some(parent) = path.parent() {
|
||||
|
|
@ -34,8 +46,8 @@ impl IndexStore {
|
|||
// This is a rebuildable cache, so changed notes get their child rows replaced in place.
|
||||
for note in &index.notes {
|
||||
summary.notes_seen += 1;
|
||||
let existing_hash = existing_note_hash(&tx, ¬e.path)?;
|
||||
if existing_hash.as_deref() == Some(note.content_hash.as_str()) {
|
||||
let fresh = existing_note_fresh(&tx, ¬e.path, ¬e.content_hash)?;
|
||||
if fresh {
|
||||
summary.unchanged_notes += 1;
|
||||
debug!(path = %note.path.display(), "skipping unchanged note");
|
||||
continue;
|
||||
|
|
@ -49,10 +61,51 @@ impl IndexStore {
|
|||
insert_links(&tx, note_id, note, &mut summary)?;
|
||||
}
|
||||
|
||||
rebuild_fts_if_empty(&tx)?;
|
||||
tx.commit()?;
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchHit>> {
|
||||
let fts_query = fts_query(query);
|
||||
if fts_query.is_empty() {
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let mut stmt = self.conn.prepare(
|
||||
r#"
|
||||
SELECT
|
||||
notes.path,
|
||||
notes.title,
|
||||
chunks.heading_path,
|
||||
snippet(chunks_fts, 0, '[', ']', '...', 18) AS snippet,
|
||||
bm25(chunks_fts) AS score,
|
||||
chunks.token_estimate
|
||||
FROM chunks_fts
|
||||
JOIN chunks ON chunks.id = chunks_fts.rowid
|
||||
JOIN notes ON notes.id = chunks.note_id
|
||||
WHERE chunks_fts MATCH ?1
|
||||
ORDER BY score
|
||||
LIMIT ?2
|
||||
"#,
|
||||
)?;
|
||||
|
||||
let hits = stmt
|
||||
.query_map(params![fts_query, limit as i64], |row| {
|
||||
Ok(SearchHit {
|
||||
path: row.get(0)?,
|
||||
title: row.get(1)?,
|
||||
heading_path: row.get(2)?,
|
||||
snippet: row.get(3)?,
|
||||
score: -row.get::<_, f64>(4)?,
|
||||
token_estimate: row.get::<_, i64>(5)? as usize,
|
||||
})
|
||||
})?
|
||||
.collect::<rusqlite::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(hits)
|
||||
}
|
||||
|
||||
fn bootstrap(&self) -> Result<()> {
|
||||
self.conn.execute_batch(
|
||||
r#"
|
||||
|
|
@ -72,6 +125,7 @@ impl IndexStore {
|
|||
modified_unix_secs INTEGER,
|
||||
file_size INTEGER NOT NULL,
|
||||
content_hash TEXT NOT NULL,
|
||||
index_version INTEGER NOT NULL DEFAULT 2,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
|
@ -91,6 +145,14 @@ impl IndexStore {
|
|||
UNIQUE(note_id, chunk_index)
|
||||
);
|
||||
|
||||
CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts
|
||||
USING fts5(
|
||||
content,
|
||||
path UNINDEXED,
|
||||
title UNINDEXED,
|
||||
heading_path UNINDEXED
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS tags (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL UNIQUE
|
||||
|
|
@ -116,18 +178,39 @@ impl IndexStore {
|
|||
INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
|
||||
"#,
|
||||
)?;
|
||||
ensure_notes_index_version(&self.conn)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn existing_note_hash(conn: &Connection, path: &Path) -> Result<Option<String>> {
|
||||
conn.query_row(
|
||||
"SELECT content_hash FROM notes WHERE path = ?1",
|
||||
[path_to_db(path)],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.optional()
|
||||
.context("failed to read existing note hash")
|
||||
fn existing_note_fresh(conn: &Connection, path: &Path, content_hash: &str) -> Result<bool> {
|
||||
let existing = conn
|
||||
.query_row(
|
||||
"SELECT content_hash, index_version FROM notes WHERE path = ?1",
|
||||
[path_to_db(path)],
|
||||
|row| Ok((row.get::<_, String>(0)?, row.get::<_, i64>(1)?)),
|
||||
)
|
||||
.optional()
|
||||
.context("failed to read existing note freshness")?;
|
||||
|
||||
Ok(existing.is_some_and(|(hash, version)| hash == content_hash && version == INDEX_VERSION))
|
||||
}
|
||||
|
||||
/// Adds the `notes.index_version` column when the table does not have it yet.
///
/// The column is added with a default of 1, so rows that existed before the
/// column did will not equal an `INDEX_VERSION` of 2.
///
/// # Errors
/// Fails if the table info cannot be read or the `ALTER TABLE` fails.
fn ensure_notes_index_version(conn: &Connection) -> Result<()> {
    let mut stmt = conn.prepare("PRAGMA table_info(notes)")?;
    // Column 1 of `table_info` is the column name.
    let names = stmt.query_map([], |row| row.get::<_, String>(1))?;

    let mut has_column = false;
    for name in names {
        if name? == "index_version" {
            has_column = true;
        }
    }
    if has_column {
        return Ok(());
    }

    conn.execute(
        "ALTER TABLE notes ADD COLUMN index_version INTEGER NOT NULL DEFAULT 1",
        [],
    )
    .context("failed to add notes.index_version")?;
    Ok(())
}
|
||||
|
||||
fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
||||
|
|
@ -140,15 +223,17 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
|||
modified_unix_secs,
|
||||
file_size,
|
||||
content_hash,
|
||||
index_version,
|
||||
updated_at
|
||||
)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, CURRENT_TIMESTAMP)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, CURRENT_TIMESTAMP)
|
||||
ON CONFLICT(path) DO UPDATE SET
|
||||
filename = excluded.filename,
|
||||
title = excluded.title,
|
||||
modified_unix_secs = excluded.modified_unix_secs,
|
||||
file_size = excluded.file_size,
|
||||
content_hash = excluded.content_hash,
|
||||
index_version = excluded.index_version,
|
||||
updated_at = CURRENT_TIMESTAMP
|
||||
"#,
|
||||
params![
|
||||
|
|
@ -158,6 +243,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
|||
note.modified_unix_secs,
|
||||
note.file_size,
|
||||
note.content_hash,
|
||||
INDEX_VERSION,
|
||||
],
|
||||
)?;
|
||||
|
||||
|
|
@ -170,6 +256,7 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
|||
}
|
||||
|
||||
fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
|
||||
delete_note_fts(conn, note_id)?;
|
||||
conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?;
|
||||
conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?;
|
||||
conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?;
|
||||
|
|
@ -182,8 +269,8 @@ fn insert_chunks(
|
|||
note: &NoteMetadata,
|
||||
summary: &mut IndexWriteSummary,
|
||||
) -> Result<()> {
|
||||
for (idx, block) in note.blocks.iter().enumerate() {
|
||||
if block.text.trim().is_empty() {
|
||||
for chunk in ¬e.chunks {
|
||||
if chunk.content.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -204,21 +291,73 @@ fn insert_chunks(
|
|||
"#,
|
||||
params![
|
||||
note_id,
|
||||
idx as i64,
|
||||
block.heading_path.join(" > "),
|
||||
block.text,
|
||||
chunk_type(&block.kind),
|
||||
block.start_line as i64,
|
||||
block.end_line as i64,
|
||||
estimate_tokens(&block.text) as i64,
|
||||
sha256_hex(&block.text),
|
||||
chunk.index as i64,
|
||||
chunk.heading_path.join(" > "),
|
||||
chunk.content,
|
||||
chunk_type_name(&chunk.chunk_type),
|
||||
chunk.start_line as i64,
|
||||
chunk.end_line as i64,
|
||||
chunk.token_estimate as i64,
|
||||
chunk.content_hash,
|
||||
],
|
||||
)?;
|
||||
let chunk_id = conn.last_insert_rowid();
|
||||
insert_chunk_fts(conn, chunk_id, note, chunk)?;
|
||||
summary.chunks_written += 1;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_chunk_fts(
|
||||
conn: &Connection,
|
||||
chunk_id: i64,
|
||||
note: &NoteMetadata,
|
||||
chunk: &crate::chunk::NoteChunk,
|
||||
) -> Result<()> {
|
||||
conn.execute(
|
||||
"INSERT INTO chunks_fts (rowid, content, path, title, heading_path) VALUES (?1, ?2, ?3, ?4, ?5)",
|
||||
params![
|
||||
chunk_id,
|
||||
chunk.content,
|
||||
path_to_db(¬e.path),
|
||||
note.title,
|
||||
chunk.heading_path.join(" > "),
|
||||
],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Removes the FTS rows that belong to the chunks of `note_id`.
///
/// The rows are found through the `chunks` table, so this must run while the
/// note's `chunks` rows still exist. A single statement with a subquery
/// replaces the earlier select-then-delete-per-row loop.
///
/// # Errors
/// Propagates the error if the delete fails.
fn delete_note_fts(conn: &Connection, note_id: i64) -> Result<()> {
    conn.execute(
        "DELETE FROM chunks_fts WHERE rowid IN (SELECT id FROM chunks WHERE note_id = ?1)",
        [note_id],
    )?;
    Ok(())
}
|
||||
|
||||
fn rebuild_fts_if_empty(conn: &Connection) -> Result<()> {
|
||||
let fts_count: i64 = conn.query_row("SELECT count(*) FROM chunks_fts", [], |row| row.get(0))?;
|
||||
let chunk_count: i64 = conn.query_row("SELECT count(*) FROM chunks", [], |row| row.get(0))?;
|
||||
if fts_count > 0 || chunk_count == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
conn.execute(
|
||||
r#"
|
||||
INSERT INTO chunks_fts (rowid, content, path, title, heading_path)
|
||||
SELECT chunks.id, chunks.content, notes.path, notes.title, chunks.heading_path
|
||||
FROM chunks
|
||||
JOIN notes ON notes.id = chunks.note_id
|
||||
"#,
|
||||
[],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_tags(
|
||||
conn: &Connection,
|
||||
note_id: i64,
|
||||
|
|
@ -259,19 +398,16 @@ pub fn sha256_hex(content: &str) -> String {
|
|||
format!("{:x}", Sha256::digest(content.as_bytes()))
|
||||
}
|
||||
|
||||
/// Rough token estimate: whitespace-separated word count, with a floor of 1.
fn estimate_tokens(content: &str) -> usize {
    let words = content.split_whitespace().count();
    if words == 0 { 1 } else { words }
}
|
||||
|
||||
fn chunk_type(kind: &MarkdownBlockKind) -> &'static str {
|
||||
match kind {
|
||||
MarkdownBlockKind::Heading => "heading",
|
||||
MarkdownBlockKind::Paragraph => "paragraph",
|
||||
MarkdownBlockKind::CodeBlock => "code_block",
|
||||
MarkdownBlockKind::List => "list",
|
||||
}
|
||||
}
|
||||
|
||||
/// Renders a path as the string stored in the database: lossy UTF-8 with every
/// backslash replaced by a forward slash.
fn path_to_db(path: &Path) -> String {
    let owned = PathBuf::from(path);
    let lossy = owned.to_string_lossy();
    lossy.replace('\\', "/")
}
|
||||
|
||||
/// Turns free-form user input into an FTS5 query of quoted terms.
///
/// The input is split on whitespace. Each term has leading and trailing
/// characters stripped unless they are alphanumeric, `_` or `-`. Terms left
/// empty are dropped. Each remaining term is wrapped in double quotes, with
/// embedded double quotes doubled, and the terms are joined by single spaces.
/// The result is empty when no term survives.
fn fts_query(query: &str) -> String {
    let is_edge_noise = |c: char| !(c.is_alphanumeric() || c == '_' || c == '-');

    let mut quoted: Vec<String> = Vec::new();
    for raw in query.split_whitespace() {
        let term = raw.trim_matches(is_edge_noise);
        if term.is_empty() {
            continue;
        }
        let escaped = term.replace('"', "\"\"");
        quoted.push(format!("\"{}\"", escaped));
    }
    quoted.join(" ")
}
|
||||
|
|
|
|||
43
src/main.rs
43
src/main.rs
|
|
@ -1,3 +1,4 @@
|
|||
mod chunk;
|
||||
mod cli;
|
||||
mod config;
|
||||
mod db;
|
||||
|
|
@ -11,7 +12,7 @@ use tracing::{debug, info};
|
|||
|
||||
use crate::cli::{Cli, Commands, OutputFormat};
|
||||
use crate::config::Config;
|
||||
use crate::db::IndexStore;
|
||||
use crate::db::{IndexStore, SearchHit};
|
||||
use crate::vault::VaultIndex;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
|
|
@ -54,21 +55,15 @@ fn main() -> Result<()> {
|
|||
limit,
|
||||
output,
|
||||
} => {
|
||||
let index = VaultIndex::scan(&config)?;
|
||||
let results = index.search(&query, limit);
|
||||
let db_path = ensure_index_cache(&config)?;
|
||||
let store = IndexStore::open(&db_path)?;
|
||||
let results = store.search(&query, limit)?;
|
||||
match output {
|
||||
OutputFormat::Text => {
|
||||
if results.is_empty() {
|
||||
println!("No matches.");
|
||||
}
|
||||
for (position, result) in results.iter().enumerate() {
|
||||
println!("{}. {}", position + 1, result.note.path.display());
|
||||
println!(" title: {}", result.note.title);
|
||||
if !result.note.headings.is_empty() {
|
||||
println!(" headings: {}", result.note.headings.join(" > "));
|
||||
}
|
||||
println!(" score: {}", result.score);
|
||||
}
|
||||
print_search_results(&results);
|
||||
}
|
||||
OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&results)?),
|
||||
}
|
||||
|
|
@ -105,3 +100,29 @@ fn init_project(config: &Config, force: bool) -> Result<()> {
|
|||
println!("Config: {}", Config::default_path().display());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn ensure_index_cache(config: &Config) -> Result<std::path::PathBuf> {
|
||||
let db_path = config.vault.path.join(&config.database.path);
|
||||
if db_path.exists() {
|
||||
return Ok(db_path);
|
||||
}
|
||||
|
||||
let index = VaultIndex::scan(config)?;
|
||||
config.create_agent_dirs()?;
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
store.write_index(&index)?;
|
||||
Ok(db_path)
|
||||
}
|
||||
|
||||
fn print_search_results(results: &[SearchHit]) {
|
||||
for (position, result) in results.iter().enumerate() {
|
||||
println!("{}. {}", position + 1, result.path);
|
||||
println!(" title: {}", result.title);
|
||||
if !result.heading_path.is_empty() {
|
||||
println!(" heading: {}", result.heading_path);
|
||||
}
|
||||
println!(" tokens: {}", result.token_estimate);
|
||||
println!(" score: {:.4}", result.score);
|
||||
println!(" {}", result.snippet);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
15
src/vault.rs
15
src/vault.rs
|
|
@ -8,6 +8,7 @@ use serde::Serialize;
|
|||
use tracing::{debug, warn};
|
||||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
use crate::chunk::{NoteChunk, build_chunks};
|
||||
use crate::config::Config;
|
||||
use crate::db::sha256_hex;
|
||||
use crate::markdown::{MarkdownBlock, Wikilink, parse_markdown};
|
||||
|
|
@ -30,6 +31,7 @@ pub struct NoteMetadata {
|
|||
pub content_hash: String,
|
||||
pub headings: Vec<String>,
|
||||
pub blocks: Vec<MarkdownBlock>,
|
||||
pub chunks: Vec<NoteChunk>,
|
||||
pub wikilinks: Vec<Wikilink>,
|
||||
pub tags: Vec<String>,
|
||||
}
|
||||
|
|
@ -41,6 +43,7 @@ pub struct IndexSummary {
|
|||
pub markdown_files: usize,
|
||||
pub headings: usize,
|
||||
pub blocks: usize,
|
||||
pub chunks: usize,
|
||||
pub wikilinks: usize,
|
||||
pub tags: usize,
|
||||
pub skipped_dirs: Vec<PathBuf>,
|
||||
|
|
@ -99,7 +102,7 @@ impl VaultIndex {
|
|||
continue;
|
||||
}
|
||||
|
||||
let note = read_note(entry.path(), &config.vault.path)?;
|
||||
let note = read_note(entry.path(), &config.vault.path, config)?;
|
||||
debug!(
|
||||
path = %note.path.display(),
|
||||
title = %note.title,
|
||||
|
|
@ -129,6 +132,7 @@ impl VaultIndex {
|
|||
markdown_files: self.markdown_count,
|
||||
headings: self.notes.iter().map(|note| note.headings.len()).sum(),
|
||||
blocks: self.notes.iter().map(|note| note.blocks.len()).sum(),
|
||||
chunks: self.notes.iter().map(|note| note.chunks.len()).sum(),
|
||||
wikilinks: self.notes.iter().map(|note| note.wikilinks.len()).sum(),
|
||||
tags: self.notes.iter().map(|note| note.tags.len()).sum(),
|
||||
skipped_dirs: self.skipped_dirs.clone(),
|
||||
|
|
@ -236,6 +240,7 @@ impl fmt::Display for IndexSummary {
|
|||
writeln!(f, "Markdown files: {}", self.markdown_files)?;
|
||||
writeln!(f, "Headings parsed: {}", self.headings)?;
|
||||
writeln!(f, "Markdown blocks: {}", self.blocks)?;
|
||||
writeln!(f, "Chunks: {}", self.chunks)?;
|
||||
writeln!(f, "Wikilinks: {}", self.wikilinks)?;
|
||||
writeln!(f, "Tags: {}", self.tags)?;
|
||||
writeln!(f, "Skipped dirs: {}", self.skipped_dirs.len())?;
|
||||
|
|
@ -248,7 +253,7 @@ impl fmt::Display for IndexSummary {
|
|||
}
|
||||
}
|
||||
|
||||
fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
|
||||
fn read_note(path: &Path, vault_path: &Path, config: &Config) -> Result<NoteMetadata> {
|
||||
let content =
|
||||
fs::read_to_string(path).with_context(|| format!("failed to read {}", path.display()))?;
|
||||
let metadata =
|
||||
|
|
@ -256,6 +261,11 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
|
|||
let relative_path = path.strip_prefix(vault_path).unwrap_or(path).to_path_buf();
|
||||
let source_path = relative_path.to_string_lossy().replace('\\', "/");
|
||||
let parsed = parse_markdown(&source_path, &content);
|
||||
let chunks = build_chunks(
|
||||
&parsed.blocks,
|
||||
config.index.chunk_target_tokens,
|
||||
config.index.chunk_overlap_tokens,
|
||||
);
|
||||
let content_hash = sha256_hex(&content);
|
||||
|
||||
Ok(NoteMetadata {
|
||||
|
|
@ -275,6 +285,7 @@ fn read_note(path: &Path, vault_path: &Path) -> Result<NoteMetadata> {
|
|||
content_hash,
|
||||
headings: parsed.headings,
|
||||
blocks: parsed.blocks,
|
||||
chunks,
|
||||
wikilinks: parsed.wikilinks,
|
||||
tags: parsed.tags,
|
||||
})
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user