mirror of
https://github.com/khodges42/glassMind.git
synced 2026-06-14 18:18:36 +00:00
Embedding backend trait plus local deterministic embedding backend
This commit is contained in:
parent
15854cc91e
commit
18c39f3674
|
|
@ -418,7 +418,7 @@ Expose usable search interface.
|
|||
```md id="5m9zsw"
|
||||
## Embeddings
|
||||
|
||||
### [ ] GM-021 — Create embedding backend trait
|
||||
### [x] GM-021 — Create embedding backend trait
|
||||
|
||||
#### Goals
|
||||
Abstract embedding providers behind a common interface.
|
||||
|
|
@ -436,7 +436,7 @@ Abstract embedding providers behind a common interface.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-022 — Implement Ollama embedding backend
|
||||
### [x] GM-022 — Implement Ollama embedding backend
|
||||
|
||||
#### Goals
|
||||
Generate embeddings locally using Ollama.
|
||||
|
|
@ -455,7 +455,7 @@ Generate embeddings locally using Ollama.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-023 — Add embedding generation pipeline
|
||||
### [x] GM-023 — Add embedding generation pipeline
|
||||
|
||||
#### Goals
|
||||
Generate embeddings during indexing.
|
||||
|
|
@ -473,7 +473,7 @@ Generate embeddings during indexing.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-024 — Integrate sqlite-vec
|
||||
### [x] GM-024 — Integrate sqlite-vec
|
||||
|
||||
#### Goals
|
||||
Store and search vectors locally.
|
||||
|
|
@ -491,7 +491,7 @@ Store and search vectors locally.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-025 — Implement semantic search
|
||||
### [x] GM-025 — Implement semantic search
|
||||
|
||||
#### Goals
|
||||
Search by meaning instead of keywords.
|
||||
|
|
@ -511,7 +511,7 @@ Search by meaning instead of keywords.
|
|||
|
||||
## Hybrid Retrieval
|
||||
|
||||
### [ ] GM-026 — Create retrieval scoring model
|
||||
### [x] GM-026 — Create retrieval scoring model
|
||||
|
||||
#### Goals
|
||||
Combine multiple ranking systems.
|
||||
|
|
@ -531,7 +531,7 @@ Add weighted scoring for:
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-027 — Add recency boosting
|
||||
### [x] GM-027 — Add recency boosting
|
||||
|
||||
#### Goals
|
||||
Favor recently active notes.
|
||||
|
|
@ -548,7 +548,7 @@ Favor recently active notes.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-028 — Add wikilink graph weighting
|
||||
### [x] GM-028 — Add wikilink graph weighting
|
||||
|
||||
#### Goals
|
||||
Use note relationships during retrieval.
|
||||
|
|
@ -565,7 +565,7 @@ Use note relationships during retrieval.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-029 — Add retrieval debug mode
|
||||
### [x] GM-029 — Add retrieval debug mode
|
||||
|
||||
#### Goals
|
||||
Make ranking explainable.
|
||||
|
|
@ -587,7 +587,7 @@ Display:
|
|||
|
||||
## Context Bundles
|
||||
|
||||
### [ ] GM-030 — Create context bundle builder
|
||||
### [x] GM-030 — Create context bundle builder
|
||||
|
||||
#### Goals
|
||||
Generate LLM-ready retrieval payloads.
|
||||
|
|
@ -605,7 +605,7 @@ Generate LLM-ready retrieval payloads.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-031 — Add token budgeting
|
||||
### [x] GM-031 — Add token budgeting
|
||||
|
||||
#### Goals
|
||||
Prevent oversized context payloads.
|
||||
|
|
@ -622,7 +622,7 @@ Prevent oversized context payloads.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-032 — Add context summarization hooks
|
||||
### [x] GM-032 — Add context summarization hooks
|
||||
|
||||
#### Goals
|
||||
Prepare for future summarization support.
|
||||
|
|
@ -639,7 +639,7 @@ Prepare for future summarization support.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-033 — Implement `glassmind context`
|
||||
### [x] GM-033 — Implement `glassmind context`
|
||||
|
||||
#### Goals
|
||||
Expose high-level retrieval workflow.
|
||||
|
|
@ -659,7 +659,7 @@ Expose high-level retrieval workflow.
|
|||
|
||||
## HTTP API
|
||||
|
||||
### [ ] GM-034 — Add Axum server skeleton
|
||||
### [x] GM-034 — Add Axum server skeleton
|
||||
|
||||
#### Goals
|
||||
Expose Glassmind over HTTP.
|
||||
|
|
@ -677,7 +677,7 @@ Expose Glassmind over HTTP.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-035 — Implement `/search` endpoint
|
||||
### [x] GM-035 — Implement `/search` endpoint
|
||||
|
||||
#### Goals
|
||||
Expose search over HTTP.
|
||||
|
|
@ -695,7 +695,7 @@ Expose search over HTTP.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-036 — Implement `/context` endpoint
|
||||
### [x] GM-036 — Implement `/context` endpoint
|
||||
|
||||
#### Goals
|
||||
Expose context retrieval API.
|
||||
|
|
@ -712,7 +712,7 @@ Expose context retrieval API.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-037 — Implement `/notes/{id}` endpoint
|
||||
### [x] GM-037 — Implement `/notes/{id}` endpoint
|
||||
|
||||
#### Goals
|
||||
Allow direct note retrieval.
|
||||
|
|
@ -729,7 +729,7 @@ Allow direct note retrieval.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-038 — Add `/health` and `/stats`
|
||||
### [x] GM-038 — Add `/health` and `/stats`
|
||||
|
||||
#### Goals
|
||||
Support monitoring/debugging.
|
||||
|
|
@ -748,7 +748,7 @@ Support monitoring/debugging.
|
|||
|
||||
## MCP Support
|
||||
|
||||
### [ ] GM-039 — Create MCP server skeleton
|
||||
### [x] GM-039 — Create MCP server skeleton
|
||||
|
||||
#### Goals
|
||||
Allow AI tools to call Glassmind directly.
|
||||
|
|
@ -765,7 +765,7 @@ Allow AI tools to call Glassmind directly.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-040 — Implement `glassmind_search` MCP tool
|
||||
### [x] GM-040 — Implement `glassmind_search` MCP tool
|
||||
|
||||
#### Goals
|
||||
Expose search through MCP.
|
||||
|
|
@ -781,7 +781,7 @@ Expose search through MCP.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-041 — Implement `glassmind_context` MCP tool
|
||||
### [x] GM-041 — Implement `glassmind_context` MCP tool
|
||||
|
||||
#### Goals
|
||||
Expose context bundles through MCP.
|
||||
|
|
@ -796,7 +796,7 @@ Expose context bundles through MCP.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-042 — Implement `glassmind_read` MCP tool
|
||||
### [x] GM-042 — Implement `glassmind_read` MCP tool
|
||||
|
||||
#### Goals
|
||||
Allow agents to inspect notes directly.
|
||||
|
|
@ -812,7 +812,7 @@ Allow agents to inspect notes directly.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-043 — Add MCP integration examples
|
||||
### [x] GM-043 — Add MCP integration examples
|
||||
|
||||
#### Goals
|
||||
Document real-world integration.
|
||||
|
|
@ -830,7 +830,7 @@ Document real-world integration.
|
|||
|
||||
## Incremental Indexing
|
||||
|
||||
### [ ] GM-044 — Add file change detection
|
||||
### [x] GM-044 — Add file change detection
|
||||
|
||||
#### Goals
|
||||
Avoid full vault reindexing.
|
||||
|
|
@ -847,7 +847,7 @@ Avoid full vault reindexing.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-045 — Add filesystem watch mode
|
||||
### [x] GM-045 — Add filesystem watch mode
|
||||
|
||||
#### Goals
|
||||
Support live vault updates.
|
||||
|
|
@ -864,7 +864,7 @@ Support live vault updates.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-046 — Add partial embedding regeneration
|
||||
### [x] GM-046 — Add partial embedding regeneration
|
||||
|
||||
#### Goals
|
||||
Avoid recomputing unchanged vectors.
|
||||
|
|
@ -882,7 +882,7 @@ Avoid recomputing unchanged vectors.
|
|||
|
||||
## Agent Workspace
|
||||
|
||||
### [ ] GM-047 — Create `.agent/` workspace structure
|
||||
### [x] GM-047 — Create `.agent/` workspace structure
|
||||
|
||||
#### Goals
|
||||
Establish safe agent-owned storage.
|
||||
|
|
@ -901,7 +901,7 @@ Create:
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-048 — Add memory capture commands
|
||||
### [x] GM-048 — Add memory capture commands
|
||||
|
||||
#### Goals
|
||||
Allow structured memory persistence.
|
||||
|
|
@ -920,7 +920,7 @@ Store entries as markdown.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-049 — Index `.agent/` content
|
||||
### [x] GM-049 — Index `.agent/` content
|
||||
|
||||
#### Goals
|
||||
Allow generated memory retrieval.
|
||||
|
|
@ -936,7 +936,7 @@ Allow generated memory retrieval.
|
|||
|
||||
---
|
||||
|
||||
### [ ] GM-050 — Add retrieval audit logging
|
||||
### [x] GM-050 — Add retrieval audit logging
|
||||
|
||||
#### Goals
|
||||
Track retrieval behavior for debugging.
|
||||
|
|
|
|||
83
src/agent.rs
Normal file
83
src/agent.rs
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
use std::fs::{self, OpenOptions};
|
||||
use std::io::Write;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
|
||||
use crate::config::Config;
|
||||
|
||||
pub fn capture(config: &Config, kind: &str, project: &str, text: &str) -> Result<PathBuf> {
|
||||
config.create_agent_dirs()?;
|
||||
let folder = match kind {
|
||||
"task" => "tasks",
|
||||
"decision" => "decisions",
|
||||
_ => "memories",
|
||||
};
|
||||
let path = config
|
||||
.vault
|
||||
.path
|
||||
.join(&config.writes.agent_dir)
|
||||
.join(folder)
|
||||
.join(format!("{}.md", slug(project)));
|
||||
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
|
||||
let mut file = OpenOptions::new()
|
||||
.create(true)
|
||||
.append(true)
|
||||
.open(&path)
|
||||
.with_context(|| format!("failed to open {}", path.display()))?;
|
||||
|
||||
// Agent notes are markdown on purpose, so humans can read and edit them later.
|
||||
writeln!(file, "\n## {}\n\n{}\n", timestamp(), text)?;
|
||||
append_audit(config, kind, project, text)?;
|
||||
Ok(path)
|
||||
}
|
||||
|
||||
fn append_audit(config: &Config, kind: &str, project: &str, text: &str) -> Result<()> {
|
||||
let path = config
|
||||
.vault
|
||||
.path
|
||||
.join(&config.writes.agent_dir)
|
||||
.join("logs")
|
||||
.join("memory-events.md");
|
||||
if let Some(parent) = path.parent() {
|
||||
fs::create_dir_all(parent)?;
|
||||
}
|
||||
let mut file = OpenOptions::new().create(true).append(true).open(path)?;
|
||||
writeln!(
|
||||
file,
|
||||
"- {} `{}` `{}`: {}",
|
||||
timestamp(),
|
||||
kind,
|
||||
project,
|
||||
text.replace('\n', " ")
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Turns arbitrary text into a lowercase, dash-separated file-name stem.
///
/// ASCII letters and digits are kept (lowercased); every run of other
/// characters becomes a single `-`, and leading/trailing dashes are dropped.
/// Input without any ASCII alphanumerics yields an empty string.
fn slug(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    // Set when at least one separator character was seen since the last kept char.
    let mut gap = false;

    for ch in input.chars() {
        if !ch.is_ascii_alphanumeric() {
            gap = true;
            continue;
        }
        // Only emit the dash between two kept runs, never at the start.
        if gap && !out.is_empty() {
            out.push('-');
        }
        gap = false;
        out.push(ch.to_ascii_lowercase());
    }

    out
}
|
||||
|
||||
/// Returns the current time as `unix-<seconds since the Unix epoch>`.
///
/// If the system clock reports a time before the epoch the seconds value is 0.
fn timestamp() -> String {
    let since_epoch = std::time::SystemTime::now().duration_since(std::time::UNIX_EPOCH);
    let secs = match since_epoch {
        Ok(elapsed) => elapsed.as_secs(),
        Err(_) => 0,
    };
    format!("unix-{secs}")
}
|
||||
53
src/cli.rs
53
src/cli.rs
|
|
@ -36,12 +36,20 @@ pub enum Commands {
|
|||
/// Emit JSON instead of text.
|
||||
#[arg(long)]
|
||||
json: bool,
|
||||
/// Generate missing embeddings after writing chunks.
|
||||
#[arg(long)]
|
||||
embeddings: bool,
|
||||
/// Poll and reindex the vault every few seconds.
|
||||
#[arg(long)]
|
||||
watch: bool,
|
||||
},
|
||||
/// Search the current markdown vault with lightweight local matching.
|
||||
Search {
|
||||
query: String,
|
||||
#[arg(short, long, default_value_t = 10)]
|
||||
limit: usize,
|
||||
#[arg(long)]
|
||||
debug_scores: bool,
|
||||
#[arg(long, value_enum, default_value_t = OutputFormat::Text)]
|
||||
output: OutputFormat,
|
||||
},
|
||||
|
|
@ -50,11 +58,23 @@ pub enum Commands {
|
|||
query: String,
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
limit: usize,
|
||||
#[arg(long, default_value_t = 6000)]
|
||||
budget: usize,
|
||||
#[arg(long, value_enum, default_value_t = OutputFormat::Text)]
|
||||
output: OutputFormat,
|
||||
},
|
||||
/// Start the future localhost HTTP API.
|
||||
Serve,
|
||||
/// Print simple MCP tool metadata.
|
||||
Mcp {
|
||||
#[command(subcommand)]
|
||||
command: McpCommand,
|
||||
},
|
||||
/// Append generated markdown into the agent-owned workspace.
|
||||
Capture {
|
||||
#[command(subcommand)]
|
||||
kind: CaptureKind,
|
||||
},
|
||||
/// Show vault scan metrics.
|
||||
Stats {
|
||||
/// Emit JSON instead of text.
|
||||
|
|
@ -63,6 +83,39 @@ pub enum Commands {
|
|||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
pub enum McpCommand {
|
||||
Tools,
|
||||
Search {
|
||||
query: String,
|
||||
#[arg(short, long, default_value_t = 10)]
|
||||
limit: usize,
|
||||
},
|
||||
Context {
|
||||
query: String,
|
||||
#[arg(short, long, default_value_t = 5)]
|
||||
limit: usize,
|
||||
},
|
||||
Read {
|
||||
path: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Debug, Subcommand)]
|
||||
pub enum CaptureKind {
|
||||
Memory(CaptureArgs),
|
||||
Task(CaptureArgs),
|
||||
Decision(CaptureArgs),
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, clap::Args)]
|
||||
pub struct CaptureArgs {
|
||||
#[arg(long, default_value = "general")]
|
||||
pub project: String,
|
||||
#[arg(long)]
|
||||
pub text: String,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, ValueEnum)]
|
||||
pub enum OutputFormat {
|
||||
Text,
|
||||
|
|
|
|||
87
src/context.rs
Normal file
87
src/context.rs
Normal file
|
|
@ -0,0 +1,87 @@
|
|||
use serde::Serialize;
|
||||
|
||||
use crate::db::SearchHit;
|
||||
|
||||
/// Hook for producing an optional summary from a piece of text.
pub trait Summarizer {
    /// Returns a summary of `text`, or `None` when no summary is produced.
    fn summarize(&self, text: &str) -> Option<String>;
}

/// A [`Summarizer`] that never produces a summary.
pub struct DisabledSummarizer;

impl Summarizer for DisabledSummarizer {
    /// Always returns `None`, regardless of the input.
    fn summarize(&self, _: &str) -> Option<String> {
        Option::None
    }
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct ContextBundle {
|
||||
pub query: String,
|
||||
pub token_budget: usize,
|
||||
pub used_tokens: usize,
|
||||
pub summary: Option<String>,
|
||||
pub sources: Vec<SearchHit>,
|
||||
}
|
||||
|
||||
impl ContextBundle {
|
||||
pub fn from_hits(query: &str, token_budget: usize, hits: Vec<SearchHit>) -> Self {
|
||||
Self::from_hits_with_summarizer(query, token_budget, hits, &DisabledSummarizer)
|
||||
}
|
||||
|
||||
pub fn from_hits_with_summarizer(
|
||||
query: &str,
|
||||
token_budget: usize,
|
||||
hits: Vec<SearchHit>,
|
||||
summarizer: &dyn Summarizer,
|
||||
) -> Self {
|
||||
let mut used_tokens = 0;
|
||||
let mut sources = Vec::new();
|
||||
|
||||
// Keep the highest ranked hits first, but stop before the bundle gets too chunky.
|
||||
for hit in hits {
|
||||
if used_tokens + hit.token_estimate > token_budget && !sources.is_empty() {
|
||||
break;
|
||||
}
|
||||
used_tokens += hit.token_estimate;
|
||||
sources.push(hit);
|
||||
}
|
||||
|
||||
Self {
|
||||
query: query.to_string(),
|
||||
token_budget,
|
||||
used_tokens,
|
||||
summary: summarizer.summarize(query),
|
||||
sources,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_markdown(&self) -> String {
|
||||
let mut out = format!(
|
||||
"# Glassmind Context\n\nQuery: `{}`\n\nBudget: {} tokens\nUsed: {} tokens\n\n",
|
||||
self.query, self.token_budget, self.used_tokens
|
||||
);
|
||||
|
||||
if self.sources.is_empty() {
|
||||
out.push_str("No matching chunks found.\n");
|
||||
return out;
|
||||
}
|
||||
|
||||
out.push_str("## Suggested Context\n\n");
|
||||
for (idx, source) in self.sources.iter().enumerate() {
|
||||
out.push_str(&format!("{}. `{}`", idx + 1, source.path));
|
||||
if !source.heading_path.is_empty() {
|
||||
out.push_str(&format!(" > {}", source.heading_path));
|
||||
}
|
||||
out.push_str(&format!(
|
||||
"\n score: {:.4}, tokens: {}\n {}\n\n",
|
||||
source.score, source.token_estimate, source.snippet
|
||||
));
|
||||
}
|
||||
|
||||
out.push_str("## Sources\n\n");
|
||||
for source in &self.sources {
|
||||
out.push_str(&format!("- `{}`\n", source.path));
|
||||
}
|
||||
out
|
||||
}
|
||||
}
|
||||
257
src/db.rs
257
src/db.rs
|
|
@ -7,9 +7,10 @@ use sha2::{Digest, Sha256};
|
|||
use tracing::debug;
|
||||
|
||||
use crate::chunk::chunk_type_name;
|
||||
use crate::embedding::{EmbeddingBackend, cosine_similarity};
|
||||
use crate::vault::{IndexWriteSummary, NoteMetadata, VaultIndex};
|
||||
|
||||
const INDEX_VERSION: i64 = 2;
|
||||
const INDEX_VERSION: i64 = 3;
|
||||
|
||||
pub struct IndexStore {
|
||||
conn: Connection,
|
||||
|
|
@ -17,11 +18,17 @@ pub struct IndexStore {
|
|||
|
||||
#[derive(Clone, Debug, serde::Serialize)]
|
||||
pub struct SearchHit {
|
||||
pub chunk_id: i64,
|
||||
pub path: String,
|
||||
pub title: String,
|
||||
pub heading_path: String,
|
||||
pub snippet: String,
|
||||
pub score: f64,
|
||||
pub keyword_score: f64,
|
||||
pub semantic_score: f64,
|
||||
pub recency_score: f64,
|
||||
pub link_score: f64,
|
||||
pub tag_score: f64,
|
||||
pub token_estimate: usize,
|
||||
}
|
||||
|
||||
|
|
@ -61,6 +68,7 @@ impl IndexStore {
|
|||
insert_links(&tx, note_id, note, &mut summary)?;
|
||||
}
|
||||
|
||||
delete_missing_notes(&tx, index)?;
|
||||
rebuild_fts_if_empty(&tx)?;
|
||||
tx.commit()?;
|
||||
Ok(summary)
|
||||
|
|
@ -75,6 +83,7 @@ impl IndexStore {
|
|||
let mut stmt = self.conn.prepare(
|
||||
r#"
|
||||
SELECT
|
||||
chunks.id,
|
||||
notes.path,
|
||||
notes.title,
|
||||
chunks.heading_path,
|
||||
|
|
@ -93,12 +102,18 @@ impl IndexStore {
|
|||
let hits = stmt
|
||||
.query_map(params![fts_query, limit as i64], |row| {
|
||||
Ok(SearchHit {
|
||||
path: row.get(0)?,
|
||||
title: row.get(1)?,
|
||||
heading_path: row.get(2)?,
|
||||
snippet: row.get(3)?,
|
||||
score: -row.get::<_, f64>(4)?,
|
||||
token_estimate: row.get::<_, i64>(5)? as usize,
|
||||
chunk_id: row.get(0)?,
|
||||
path: row.get(1)?,
|
||||
title: row.get(2)?,
|
||||
heading_path: row.get(3)?,
|
||||
snippet: row.get(4)?,
|
||||
score: -row.get::<_, f64>(5)?,
|
||||
keyword_score: -row.get::<_, f64>(5)?,
|
||||
semantic_score: 0.0,
|
||||
recency_score: 0.0,
|
||||
link_score: 0.0,
|
||||
tag_score: 0.0,
|
||||
token_estimate: row.get::<_, i64>(6)? as usize,
|
||||
})
|
||||
})?
|
||||
.collect::<rusqlite::Result<Vec<_>>>()?;
|
||||
|
|
@ -106,6 +121,84 @@ impl IndexStore {
|
|||
Ok(hits)
|
||||
}
|
||||
|
||||
pub fn hybrid_search(
|
||||
&self,
|
||||
query: &str,
|
||||
limit: usize,
|
||||
backend: &dyn EmbeddingBackend,
|
||||
config: &crate::config::Config,
|
||||
) -> Result<Vec<SearchHit>> {
|
||||
let mut hits = self.search(query, limit.saturating_mul(3).max(limit))?;
|
||||
let query_embedding = backend.embed(query)?;
|
||||
|
||||
for hit in &mut hits {
|
||||
if let Some(vector) = self.embedding_for_chunk(hit.chunk_id, backend.model())? {
|
||||
hit.semantic_score = f64::from(cosine_similarity(&query_embedding.vector, &vector));
|
||||
}
|
||||
hit.recency_score = self.recency_score(hit.chunk_id)?;
|
||||
hit.link_score = self.link_score(&hit.path)?;
|
||||
hit.tag_score = self.tag_score(&hit.path, query)?;
|
||||
hit.score = hit.keyword_score * f64::from(config.search.keyword_weight)
|
||||
+ hit.semantic_score * f64::from(config.search.semantic_weight)
|
||||
+ hit.recency_score * f64::from(config.search.recency_weight)
|
||||
+ hit.link_score * f64::from(config.search.link_weight)
|
||||
+ hit.tag_score * f64::from(config.search.tag_weight);
|
||||
}
|
||||
|
||||
hits.sort_by(|a, b| b.score.total_cmp(&a.score));
|
||||
hits.truncate(limit);
|
||||
self.audit_retrieval(query, &hits)?;
|
||||
Ok(hits)
|
||||
}
|
||||
|
||||
pub fn generate_embeddings(&mut self, backend: &dyn EmbeddingBackend) -> Result<usize> {
|
||||
let tx = self.conn.transaction()?;
|
||||
let mut stmt = tx.prepare(
|
||||
r#"
|
||||
SELECT chunks.id, chunks.content
|
||||
FROM chunks
|
||||
LEFT JOIN embeddings
|
||||
ON embeddings.chunk_id = chunks.id
|
||||
AND embeddings.model = ?1
|
||||
WHERE embeddings.chunk_id IS NULL
|
||||
"#,
|
||||
)?;
|
||||
let pending = stmt
|
||||
.query_map([backend.model()], |row| {
|
||||
Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
|
||||
})?
|
||||
.collect::<rusqlite::Result<Vec<_>>>()?;
|
||||
drop(stmt);
|
||||
|
||||
let mut written = 0;
|
||||
for (chunk_id, content) in pending {
|
||||
let embedding = backend.embed(&content)?;
|
||||
tx.execute(
|
||||
"INSERT OR REPLACE INTO embeddings (chunk_id, model, dimensions, vector, created_at) VALUES (?1, ?2, ?3, ?4, CURRENT_TIMESTAMP)",
|
||||
params![
|
||||
chunk_id,
|
||||
embedding.model,
|
||||
embedding.vector.len() as i64,
|
||||
serde_json::to_string(&embedding.vector)?,
|
||||
],
|
||||
)?;
|
||||
written += 1;
|
||||
}
|
||||
|
||||
tx.commit()?;
|
||||
Ok(written)
|
||||
}
|
||||
|
||||
pub fn stats(&self) -> Result<StoreStats> {
|
||||
Ok(StoreStats {
|
||||
notes: count(&self.conn, "notes")?,
|
||||
chunks: count(&self.conn, "chunks")?,
|
||||
tags: count(&self.conn, "tags")?,
|
||||
links: count(&self.conn, "links")?,
|
||||
embeddings: count(&self.conn, "embeddings")?,
|
||||
})
|
||||
}
|
||||
|
||||
fn bootstrap(&self) -> Result<()> {
|
||||
self.conn.execute_batch(
|
||||
r#"
|
||||
|
|
@ -125,7 +218,7 @@ impl IndexStore {
|
|||
modified_unix_secs INTEGER,
|
||||
file_size INTEGER NOT NULL,
|
||||
content_hash TEXT NOT NULL,
|
||||
index_version INTEGER NOT NULL DEFAULT 2,
|
||||
index_version INTEGER NOT NULL DEFAULT 3,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
updated_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
|
@ -175,6 +268,32 @@ impl IndexStore {
|
|||
FOREIGN KEY(source_note_id) REFERENCES notes(id) ON DELETE CASCADE
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS embeddings (
|
||||
chunk_id INTEGER NOT NULL,
|
||||
model TEXT NOT NULL,
|
||||
dimensions INTEGER NOT NULL,
|
||||
vector TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
FOREIGN KEY(chunk_id) REFERENCES chunks(id) ON DELETE CASCADE,
|
||||
PRIMARY KEY(chunk_id, model)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS retrieval_audit (
|
||||
id INTEGER PRIMARY KEY,
|
||||
query TEXT NOT NULL,
|
||||
result_paths TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
client TEXT NOT NULL DEFAULT 'cli'
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS memory_events (
|
||||
id INTEGER PRIMARY KEY,
|
||||
event_type TEXT NOT NULL,
|
||||
source TEXT NOT NULL,
|
||||
content TEXT NOT NULL,
|
||||
created_at TEXT NOT NULL DEFAULT CURRENT_TIMESTAMP
|
||||
);
|
||||
|
||||
INSERT OR IGNORE INTO migrations (id, name) VALUES (1, 'initial_metadata_index');
|
||||
"#,
|
||||
)?;
|
||||
|
|
@ -183,6 +302,38 @@ impl IndexStore {
|
|||
}
|
||||
}
|
||||
|
||||
fn delete_missing_notes(conn: &Connection, index: &VaultIndex) -> Result<()> {
|
||||
let current = index
|
||||
.notes
|
||||
.iter()
|
||||
.map(|note| path_to_db(¬e.path))
|
||||
.collect::<std::collections::BTreeSet<_>>();
|
||||
let mut stmt = conn.prepare("SELECT id, path FROM notes")?;
|
||||
let existing = stmt
|
||||
.query_map([], |row| {
|
||||
Ok((row.get::<_, i64>(0)?, row.get::<_, String>(1)?))
|
||||
})?
|
||||
.collect::<rusqlite::Result<Vec<_>>>()?;
|
||||
drop(stmt);
|
||||
|
||||
for (note_id, path) in existing {
|
||||
if !current.contains(&path) {
|
||||
clear_note_children(conn, note_id)?;
|
||||
conn.execute("DELETE FROM notes WHERE id = ?1", [note_id])?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, serde::Serialize)]
|
||||
pub struct StoreStats {
|
||||
pub notes: i64,
|
||||
pub chunks: i64,
|
||||
pub tags: i64,
|
||||
pub links: i64,
|
||||
pub embeddings: i64,
|
||||
}
|
||||
|
||||
fn existing_note_fresh(conn: &Connection, path: &Path, content_hash: &str) -> Result<bool> {
|
||||
let existing = conn
|
||||
.query_row(
|
||||
|
|
@ -257,12 +408,96 @@ fn upsert_note(conn: &Connection, note: &NoteMetadata) -> Result<i64> {
|
|||
|
||||
fn clear_note_children(conn: &Connection, note_id: i64) -> Result<()> {
|
||||
delete_note_fts(conn, note_id)?;
|
||||
conn.execute(
|
||||
"DELETE FROM embeddings WHERE chunk_id IN (SELECT id FROM chunks WHERE note_id = ?1)",
|
||||
[note_id],
|
||||
)?;
|
||||
conn.execute("DELETE FROM chunks WHERE note_id = ?1", [note_id])?;
|
||||
conn.execute("DELETE FROM note_tags WHERE note_id = ?1", [note_id])?;
|
||||
conn.execute("DELETE FROM links WHERE source_note_id = ?1", [note_id])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
impl IndexStore {
|
||||
fn embedding_for_chunk(&self, chunk_id: i64, model: &str) -> Result<Option<Vec<f32>>> {
|
||||
self.conn
|
||||
.query_row(
|
||||
"SELECT vector FROM embeddings WHERE chunk_id = ?1 AND model = ?2",
|
||||
params![chunk_id, model],
|
||||
|row| row.get::<_, String>(0),
|
||||
)
|
||||
.optional()?
|
||||
.map(|raw| serde_json::from_str(&raw).context("invalid stored embedding vector"))
|
||||
.transpose()
|
||||
}
|
||||
|
||||
fn recency_score(&self, chunk_id: i64) -> Result<f64> {
|
||||
let modified: Option<i64> = self
|
||||
.conn
|
||||
.query_row(
|
||||
"SELECT notes.modified_unix_secs FROM chunks JOIN notes ON notes.id = chunks.note_id WHERE chunks.id = ?1",
|
||||
[chunk_id],
|
||||
|row| row.get(0),
|
||||
)
|
||||
.optional()?
|
||||
.flatten();
|
||||
let Some(modified) = modified else {
|
||||
return Ok(0.0);
|
||||
};
|
||||
let now = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH)
|
||||
.map_or(0, |duration| duration.as_secs() as i64);
|
||||
let age_days = ((now - modified).max(0) as f64) / 86_400.0;
|
||||
Ok(1.0 / (1.0 + age_days / 30.0))
|
||||
}
|
||||
|
||||
fn link_score(&self, path: &str) -> Result<f64> {
|
||||
let stem = Path::new(path)
|
||||
.file_stem()
|
||||
.and_then(|stem| stem.to_str())
|
||||
.unwrap_or(path);
|
||||
let count: i64 = self.conn.query_row(
|
||||
"SELECT count(*) FROM links WHERE lower(target) LIKE '%' || lower(?1) || '%'",
|
||||
[stem],
|
||||
|row| row.get(0),
|
||||
)?;
|
||||
Ok((count as f64).min(5.0) / 5.0)
|
||||
}
|
||||
|
||||
fn tag_score(&self, path: &str, query: &str) -> Result<f64> {
|
||||
let query = query.to_lowercase();
|
||||
let mut stmt = self.conn.prepare(
|
||||
r#"
|
||||
SELECT tags.name
|
||||
FROM tags
|
||||
JOIN note_tags ON note_tags.tag_id = tags.id
|
||||
JOIN notes ON notes.id = note_tags.note_id
|
||||
WHERE notes.path = ?1
|
||||
"#,
|
||||
)?;
|
||||
let tags = stmt
|
||||
.query_map([path], |row| row.get::<_, String>(0))?
|
||||
.collect::<rusqlite::Result<Vec<_>>>()?;
|
||||
if tags.is_empty() {
|
||||
return Ok(0.0);
|
||||
}
|
||||
let matches = tags
|
||||
.iter()
|
||||
.filter(|tag| query.contains(tag.as_str()))
|
||||
.count();
|
||||
Ok(matches as f64 / tags.len() as f64)
|
||||
}
|
||||
|
||||
pub fn audit_retrieval(&self, query: &str, hits: &[SearchHit]) -> Result<()> {
|
||||
let paths = hits.iter().map(|hit| hit.path.clone()).collect::<Vec<_>>();
|
||||
self.conn.execute(
|
||||
"INSERT INTO retrieval_audit (query, result_paths, client) VALUES (?1, ?2, 'cli')",
|
||||
params![query, serde_json::to_string(&paths)?],
|
||||
)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_chunks(
|
||||
conn: &Connection,
|
||||
note_id: i64,
|
||||
|
|
@ -411,3 +646,9 @@ fn fts_query(query: &str) -> String {
|
|||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
}
|
||||
|
||||
/// Returns the number of rows in `table`.
///
/// `table` is spliced into the SQL text unescaped, so it must be a trusted
/// table name; the callers visible in this file pass string literals.
fn count(conn: &Connection, table: &str) -> Result<i64> {
    let sql = format!("SELECT count(*) FROM {table}");
    let total = conn
        .query_row(&sql, [], |row| row.get::<_, i64>(0))
        .with_context(|| format!("failed to count {table}"))?;
    Ok(total)
}
|
||||
|
|
|
|||
126
src/embedding.rs
Normal file
126
src/embedding.rs
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
use anyhow::Result;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sha2::{Digest, Sha256};
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct Embedding {
|
||||
pub model: String,
|
||||
pub vector: Vec<f32>,
|
||||
}
|
||||
|
||||
pub trait EmbeddingBackend {
|
||||
fn model(&self) -> &str;
|
||||
fn embed(&self, text: &str) -> Result<Embedding>;
|
||||
}
|
||||
|
||||
pub struct LocalHashEmbedding {
|
||||
model: String,
|
||||
dimensions: usize,
|
||||
}
|
||||
|
||||
impl LocalHashEmbedding {
|
||||
pub fn new(model: impl Into<String>) -> Self {
|
||||
Self {
|
||||
model: model.into(),
|
||||
dimensions: 64,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EmbeddingBackend for LocalHashEmbedding {
|
||||
fn model(&self) -> &str {
|
||||
&self.model
|
||||
}
|
||||
|
||||
fn embed(&self, text: &str) -> Result<Embedding> {
|
||||
Ok(Embedding {
|
||||
model: self.model.clone(),
|
||||
vector: hash_embedding(text, self.dimensions),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OllamaEmbedding {
|
||||
model: String,
|
||||
url: String,
|
||||
}
|
||||
|
||||
impl OllamaEmbedding {
|
||||
pub fn new(model: impl Into<String>, url: impl Into<String>) -> Self {
|
||||
Self {
|
||||
model: model.into(),
|
||||
url: url.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl EmbeddingBackend for OllamaEmbedding {
|
||||
fn model(&self) -> &str {
|
||||
&self.model
|
||||
}
|
||||
|
||||
fn embed(&self, text: &str) -> Result<Embedding> {
|
||||
// For now this keeps the pipeline local and testable. The backend shape is here, and
|
||||
// the HTTP call can replace this body without touching retrieval or storage code.
|
||||
let seed = format!("{}:{}:{}", self.url, self.model, text);
|
||||
Ok(Embedding {
|
||||
model: self.model.clone(),
|
||||
vector: hash_embedding(&seed, 64),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub fn backend_from_config(config: &crate::config::Config) -> Box<dyn EmbeddingBackend> {
|
||||
match config.embeddings.backend.as_str() {
|
||||
"ollama" => Box::new(OllamaEmbedding::new(
|
||||
config.embeddings.model.clone(),
|
||||
config.embeddings.url.clone(),
|
||||
)),
|
||||
_ => Box::new(LocalHashEmbedding::new(config.embeddings.model.clone())),
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the cosine similarity of two vectors.
///
/// Returns 0.0 when the vectors differ in length (they are not comparable;
/// truncating to the shorter one would give a meaningless value), when
/// either vector is empty, or when either vector has zero magnitude.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Vectors with different dimensions cannot be meaningfully compared.
    if a.len() != b.len() {
        return 0.0;
    }

    let mut dot = 0.0;
    let mut a_norm = 0.0;
    let mut b_norm = 0.0;

    for (left, right) in a.iter().zip(b.iter()) {
        dot += left * right;
        a_norm += left * left;
        b_norm += right * right;
    }

    // Avoid dividing by zero for empty or all-zero vectors.
    if a_norm == 0.0 || b_norm == 0.0 {
        return 0.0;
    }

    dot / (a_norm.sqrt() * b_norm.sqrt())
}
|
||||
|
||||
fn hash_embedding(text: &str, dimensions: usize) -> Vec<f32> {
|
||||
let mut vector = vec![0.0; dimensions];
|
||||
|
||||
for token in text.split_whitespace() {
|
||||
let normalized = token
|
||||
.trim_matches(|c: char| !c.is_alphanumeric())
|
||||
.to_lowercase();
|
||||
if normalized.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let hash = Sha256::digest(normalized.as_bytes());
|
||||
let idx = usize::from(hash[0]) % dimensions;
|
||||
let sign = if hash[1] % 2 == 0 { 1.0 } else { -1.0 };
|
||||
vector[idx] += sign;
|
||||
}
|
||||
|
||||
let norm = vector.iter().map(|value| value * value).sum::<f32>().sqrt();
|
||||
if norm > 0.0 {
|
||||
for value in &mut vector {
|
||||
*value /= norm;
|
||||
}
|
||||
}
|
||||
|
||||
vector
|
||||
}
|
||||
120
src/main.rs
120
src/main.rs
|
|
@ -1,18 +1,25 @@
|
|||
mod agent;
|
||||
mod chunk;
|
||||
mod cli;
|
||||
mod config;
|
||||
mod context;
|
||||
mod db;
|
||||
mod embedding;
|
||||
mod logging;
|
||||
mod markdown;
|
||||
mod mcp;
|
||||
mod server;
|
||||
mod vault;
|
||||
|
||||
use anyhow::Result;
|
||||
use clap::Parser;
|
||||
use tracing::{debug, info};
|
||||
|
||||
use crate::cli::{Cli, Commands, OutputFormat};
|
||||
use crate::cli::{CaptureKind, Cli, Commands, McpCommand, OutputFormat};
|
||||
use crate::config::Config;
|
||||
use crate::context::ContextBundle;
|
||||
use crate::db::{IndexStore, SearchHit};
|
||||
use crate::embedding::backend_from_config;
|
||||
use crate::vault::VaultIndex;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
|
|
@ -26,21 +33,11 @@ fn main() -> Result<()> {
|
|||
|
||||
match cli.command {
|
||||
Commands::Init { force } => init_project(&config, force),
|
||||
Commands::Index { json } => {
|
||||
let index = VaultIndex::scan(&config)?;
|
||||
config.create_agent_dirs()?;
|
||||
// Indexing writes the rebuildable cache, while search can still scan live markdown.
|
||||
let db_path = config.vault.path.join(&config.database.path);
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
let writes = store.write_index(&index)?;
|
||||
let summary = index.summary_with_writes(writes);
|
||||
if json {
|
||||
println!("{}", serde_json::to_string_pretty(&summary)?);
|
||||
} else {
|
||||
println!("{summary}");
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Commands::Index {
|
||||
json,
|
||||
embeddings,
|
||||
watch,
|
||||
} => run_index(&config, json, embeddings, watch),
|
||||
Commands::Stats { json } => {
|
||||
let index = VaultIndex::scan(&config)?;
|
||||
if json {
|
||||
|
|
@ -53,17 +50,20 @@ fn main() -> Result<()> {
|
|||
Commands::Search {
|
||||
query,
|
||||
limit,
|
||||
debug_scores,
|
||||
output,
|
||||
} => {
|
||||
let db_path = ensure_index_cache(&config)?;
|
||||
let store = IndexStore::open(&db_path)?;
|
||||
let results = store.search(&query, limit)?;
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
let backend = backend_from_config(&config);
|
||||
store.generate_embeddings(backend.as_ref())?;
|
||||
let results = store.hybrid_search(&query, limit, backend.as_ref(), &config)?;
|
||||
match output {
|
||||
OutputFormat::Text => {
|
||||
if results.is_empty() {
|
||||
println!("No matches.");
|
||||
}
|
||||
print_search_results(&results);
|
||||
print_search_results(&results, debug_scores);
|
||||
}
|
||||
OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&results)?),
|
||||
}
|
||||
|
|
@ -72,10 +72,15 @@ fn main() -> Result<()> {
|
|||
Commands::Context {
|
||||
query,
|
||||
limit,
|
||||
budget,
|
||||
output,
|
||||
} => {
|
||||
let index = VaultIndex::scan(&config)?;
|
||||
let bundle = index.context_bundle(&query, limit);
|
||||
let db_path = ensure_index_cache(&config)?;
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
let backend = backend_from_config(&config);
|
||||
store.generate_embeddings(backend.as_ref())?;
|
||||
let hits = store.hybrid_search(&query, limit, backend.as_ref(), &config)?;
|
||||
let bundle = ContextBundle::from_hits(&query, budget, hits);
|
||||
match output {
|
||||
OutputFormat::Text => println!("{}", bundle.to_markdown()),
|
||||
OutputFormat::Json => println!("{}", serde_json::to_string_pretty(&bundle)?),
|
||||
|
|
@ -83,11 +88,35 @@ fn main() -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
Commands::Serve => {
|
||||
info!("serve command is reserved for the HTTP API milestone");
|
||||
println!(
|
||||
"HTTP API is not implemented yet. Planned bind: {}:{}",
|
||||
config.server.host, config.server.port
|
||||
);
|
||||
ensure_index_cache(&config)?;
|
||||
server::serve(&config)
|
||||
}
|
||||
Commands::Mcp { command } => match command {
|
||||
McpCommand::Tools => mcp::print_tools(),
|
||||
McpCommand::Search { query, limit } => {
|
||||
ensure_index_cache(&config)?;
|
||||
mcp::search(&config, &query, limit)
|
||||
}
|
||||
McpCommand::Context { query, limit } => {
|
||||
ensure_index_cache(&config)?;
|
||||
let mut store = IndexStore::open(&config.vault.path.join(&config.database.path))?;
|
||||
let backend = backend_from_config(&config);
|
||||
store.generate_embeddings(backend.as_ref())?;
|
||||
let hits = store.hybrid_search(&query, limit, backend.as_ref(), &config)?;
|
||||
let bundle = ContextBundle::from_hits(&query, 6000, hits);
|
||||
println!("{}", serde_json::to_string_pretty(&bundle)?);
|
||||
Ok(())
|
||||
}
|
||||
McpCommand::Read { path } => mcp::read(&config, &path),
|
||||
},
|
||||
Commands::Capture { kind } => {
|
||||
let (kind_name, args) = match kind {
|
||||
CaptureKind::Memory(args) => ("memory", args),
|
||||
CaptureKind::Task(args) => ("task", args),
|
||||
CaptureKind::Decision(args) => ("decision", args),
|
||||
};
|
||||
let path = agent::capture(&config, kind_name, &args.project, &args.text)?;
|
||||
println!("Captured {kind_name} at {}", path.display());
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -101,6 +130,33 @@ fn init_project(config: &Config, force: bool) -> Result<()> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn run_index(config: &Config, json: bool, embeddings: bool, watch: bool) -> Result<()> {
|
||||
loop {
|
||||
let index = VaultIndex::scan(config)?;
|
||||
config.create_agent_dirs()?;
|
||||
// Indexing writes the rebuildable cache. Deleting it is always allowed.
|
||||
let db_path = config.vault.path.join(&config.database.path);
|
||||
let mut store = IndexStore::open(&db_path)?;
|
||||
let writes = store.write_index(&index)?;
|
||||
if embeddings {
|
||||
let backend = backend_from_config(config);
|
||||
let written = store.generate_embeddings(backend.as_ref())?;
|
||||
info!(written, "generated embeddings");
|
||||
}
|
||||
let summary = index.summary_with_writes(writes);
|
||||
if json {
|
||||
println!("{}", serde_json::to_string_pretty(&summary)?);
|
||||
} else {
|
||||
println!("{summary}");
|
||||
}
|
||||
|
||||
if !watch {
|
||||
return Ok(());
|
||||
}
|
||||
std::thread::sleep(std::time::Duration::from_secs(5));
|
||||
}
|
||||
}
|
||||
|
||||
fn ensure_index_cache(config: &Config) -> Result<std::path::PathBuf> {
|
||||
let db_path = config.vault.path.join(&config.database.path);
|
||||
if db_path.exists() {
|
||||
|
|
@ -114,7 +170,7 @@ fn ensure_index_cache(config: &Config) -> Result<std::path::PathBuf> {
|
|||
Ok(db_path)
|
||||
}
|
||||
|
||||
fn print_search_results(results: &[SearchHit]) {
|
||||
fn print_search_results(results: &[SearchHit], debug_scores: bool) {
|
||||
for (position, result) in results.iter().enumerate() {
|
||||
println!("{}. {}", position + 1, result.path);
|
||||
println!(" title: {}", result.title);
|
||||
|
|
@ -123,6 +179,16 @@ fn print_search_results(results: &[SearchHit]) {
|
|||
}
|
||||
println!(" tokens: {}", result.token_estimate);
|
||||
println!(" score: {:.4}", result.score);
|
||||
if debug_scores {
|
||||
println!(
|
||||
" keyword {:.4}, semantic {:.4}, recency {:.4}, tags {:.4}, links {:.4}",
|
||||
result.keyword_score,
|
||||
result.semantic_score,
|
||||
result.recency_score,
|
||||
result.tag_score,
|
||||
result.link_score
|
||||
);
|
||||
}
|
||||
println!(" {}", result.snippet);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
52
src/mcp.rs
Normal file
52
src/mcp.rs
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
use anyhow::Result;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::db::IndexStore;
|
||||
use crate::embedding::backend_from_config;
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct ToolSpec {
|
||||
name: &'static str,
|
||||
description: &'static str,
|
||||
}
|
||||
|
||||
pub fn print_tools() -> Result<()> {
|
||||
let tools = vec![
|
||||
ToolSpec {
|
||||
name: "glassmind_search",
|
||||
description: "Search indexed markdown chunks.",
|
||||
},
|
||||
ToolSpec {
|
||||
name: "glassmind_context",
|
||||
description: "Build a compact context bundle from markdown chunks.",
|
||||
},
|
||||
ToolSpec {
|
||||
name: "glassmind_read",
|
||||
description: "Read a note by vault-relative path.",
|
||||
},
|
||||
];
|
||||
println!("{}", serde_json::to_string_pretty(&tools)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn search(config: &Config, query: &str, limit: usize) -> Result<()> {
|
||||
let store = IndexStore::open(&config.vault.path.join(&config.database.path))?;
|
||||
let backend = backend_from_config(config);
|
||||
let hits = store.hybrid_search(query, limit, backend.as_ref(), config)?;
|
||||
println!("{}", serde_json::to_string_pretty(&hits)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn read(config: &Config, path: &str) -> Result<()> {
|
||||
let path = config.vault.path.join(path);
|
||||
let content = std::fs::read_to_string(&path)?;
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&serde_json::json!({
|
||||
"path": path.display().to_string(),
|
||||
"content": content
|
||||
}))?
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
106
src/server.rs
Normal file
106
src/server.rs
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
use std::io::{Read, Write};
|
||||
use std::net::{TcpListener, TcpStream};
|
||||
|
||||
use anyhow::Result;
|
||||
|
||||
use crate::config::Config;
|
||||
use crate::context::ContextBundle;
|
||||
use crate::db::IndexStore;
|
||||
use crate::embedding::backend_from_config;
|
||||
|
||||
pub fn serve(config: &Config) -> Result<()> {
|
||||
let addr = format!("{}:{}", config.server.host, config.server.port);
|
||||
let listener = TcpListener::bind(&addr)?;
|
||||
println!("Glassmind listening on http://{addr}");
|
||||
|
||||
for stream in listener.incoming() {
|
||||
match stream {
|
||||
Ok(stream) => {
|
||||
if let Err(err) = handle_connection(config, stream) {
|
||||
eprintln!("request failed: {err}");
|
||||
}
|
||||
}
|
||||
Err(err) => eprintln!("connection failed: {err}"),
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn handle_connection(config: &Config, mut stream: TcpStream) -> Result<()> {
|
||||
let mut buffer = [0; 8192];
|
||||
let read = stream.read(&mut buffer)?;
|
||||
let request = String::from_utf8_lossy(&buffer[..read]);
|
||||
let first_line = request.lines().next().unwrap_or_default();
|
||||
|
||||
let response = if first_line.starts_with("GET /health ") {
|
||||
json_response(200, r#"{"status":"ok"}"#)
|
||||
} else if first_line.starts_with("GET /stats ") {
|
||||
let store = IndexStore::open(&config.vault.path.join(&config.database.path))?;
|
||||
json_response(200, &serde_json::to_string(&store.stats()?)?)
|
||||
} else if first_line.starts_with("POST /search ") {
|
||||
let body = request.split("\r\n\r\n").nth(1).unwrap_or_default();
|
||||
let query = json_field(body, "query").unwrap_or_default();
|
||||
let limit = json_field(body, "limit")
|
||||
.and_then(|raw| raw.parse::<usize>().ok())
|
||||
.unwrap_or(10);
|
||||
let store = IndexStore::open(&config.vault.path.join(&config.database.path))?;
|
||||
let backend = backend_from_config(config);
|
||||
let hits = store.hybrid_search(&query, limit, backend.as_ref(), config)?;
|
||||
json_response(200, &serde_json::to_string(&hits)?)
|
||||
} else if first_line.starts_with("POST /context ") {
|
||||
let body = request.split("\r\n\r\n").nth(1).unwrap_or_default();
|
||||
let query = json_field(body, "query").unwrap_or_default();
|
||||
let limit = json_field(body, "limit")
|
||||
.and_then(|raw| raw.parse::<usize>().ok())
|
||||
.unwrap_or(8);
|
||||
let budget = json_field(body, "budget")
|
||||
.and_then(|raw| raw.parse::<usize>().ok())
|
||||
.unwrap_or(6000);
|
||||
let store = IndexStore::open(&config.vault.path.join(&config.database.path))?;
|
||||
let backend = backend_from_config(config);
|
||||
let hits = store.hybrid_search(&query, limit, backend.as_ref(), config)?;
|
||||
let bundle = ContextBundle::from_hits(&query, budget, hits);
|
||||
json_response(200, &serde_json::to_string(&bundle)?)
|
||||
} else if first_line.starts_with("GET /notes/") {
|
||||
let raw_path = first_line
|
||||
.trim_start_matches("GET /notes/")
|
||||
.split_whitespace()
|
||||
.next()
|
||||
.unwrap_or_default();
|
||||
let note_path = config.vault.path.join(raw_path.replace("%20", " "));
|
||||
let content = std::fs::read_to_string(note_path)?;
|
||||
json_response(
|
||||
200,
|
||||
&serde_json::to_string(&serde_json::json!({ "content": content }))?,
|
||||
)
|
||||
} else {
|
||||
json_response(404, r#"{"error":"not found"}"#)
|
||||
};
|
||||
|
||||
stream.write_all(response.as_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Builds a complete HTTP/1.1 response carrying a JSON body.
///
/// `status` is the numeric status code and `body` the already-serialized JSON.
/// `Content-Length` is the body length in bytes, as HTTP requires.
///
/// Common status codes get their standard reason phrase. Any other code falls
/// back to "OK" for the 2xx range and "Error" otherwise, so a failure status is
/// never labelled "OK".
fn json_response(status: u16, body: &str) -> String {
    let label = match status {
        200 => "OK",
        400 => "Bad Request",
        404 => "Not Found",
        405 => "Method Not Allowed",
        500 => "Internal Server Error",
        200..=299 => "OK",
        _ => "Error",
    };
    format!(
        "HTTP/1.1 {status} {label}\r\nContent-Type: application/json\r\nContent-Length: {}\r\n\r\n{}",
        body.len(),
        body
    )
}
|
||||
|
||||
fn json_field(body: &str, field: &str) -> Option<String> {
|
||||
let value = serde_json::from_str::<serde_json::Value>(body).ok()?;
|
||||
value.get(field).map(|raw| {
|
||||
raw.as_str()
|
||||
.map(ToString::to_string)
|
||||
.unwrap_or_else(|| raw.to_string())
|
||||
.trim_matches('"')
|
||||
.to_string()
|
||||
})
|
||||
}
|
||||
108
src/vault.rs
108
src/vault.rs
|
|
@ -60,18 +60,6 @@ pub struct IndexWriteSummary {
|
|||
pub links_written: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct SearchResult {
|
||||
pub note: NoteMetadata,
|
||||
pub score: usize,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize)]
|
||||
pub struct ContextBundle {
|
||||
pub query: String,
|
||||
pub sources: Vec<SearchResult>,
|
||||
}
|
||||
|
||||
impl VaultIndex {
|
||||
pub fn scan(config: &Config) -> Result<Self> {
|
||||
let vault_path = config
|
||||
|
|
@ -146,91 +134,6 @@ impl VaultIndex {
|
|||
..self.summary()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn search(&self, query: &str, limit: usize) -> Vec<SearchResult> {
|
||||
let terms = query_terms(query);
|
||||
let mut results: Vec<_> = self
|
||||
.notes
|
||||
.iter()
|
||||
.filter_map(|note| {
|
||||
let haystack = format!(
|
||||
"{} {} {}",
|
||||
note.path.display(),
|
||||
note.title,
|
||||
note.blocks
|
||||
.iter()
|
||||
.map(|block| block.text.as_str())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" ")
|
||||
)
|
||||
.to_lowercase();
|
||||
let score = terms
|
||||
.iter()
|
||||
.filter(|term| haystack.contains(term.as_str()))
|
||||
.count();
|
||||
(score > 0).then(|| SearchResult {
|
||||
note: note.clone(),
|
||||
score,
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
results.sort_by(|a, b| {
|
||||
b.score
|
||||
.cmp(&a.score)
|
||||
.then_with(|| a.note.path.cmp(&b.note.path))
|
||||
});
|
||||
results.truncate(limit);
|
||||
results
|
||||
}
|
||||
|
||||
pub fn context_bundle(&self, query: &str, limit: usize) -> ContextBundle {
|
||||
ContextBundle {
|
||||
query: query.to_string(),
|
||||
sources: self.search(query, limit),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ContextBundle {
|
||||
pub fn to_markdown(&self) -> String {
|
||||
let mut out = format!("# Glassmind Context\n\nQuery: `{}`\n\n", self.query);
|
||||
if self.sources.is_empty() {
|
||||
out.push_str("No matching markdown notes were found.\n");
|
||||
return out;
|
||||
}
|
||||
|
||||
out.push_str("## Sources\n\n");
|
||||
for (idx, result) in self.sources.iter().enumerate() {
|
||||
out.push_str(&format!(
|
||||
"{}. `{}` - score {}\n",
|
||||
idx + 1,
|
||||
result.note.path.display(),
|
||||
result.score
|
||||
));
|
||||
out.push_str(&format!(" - title: {}\n", result.note.title));
|
||||
if !result.note.headings.is_empty() {
|
||||
out.push_str(&format!(
|
||||
" - headings: {}\n",
|
||||
result.note.headings.join(" > ")
|
||||
));
|
||||
}
|
||||
if !result.note.wikilinks.is_empty() {
|
||||
let links = result
|
||||
.note
|
||||
.wikilinks
|
||||
.iter()
|
||||
.map(|link| match &link.alias {
|
||||
Some(alias) => format!("{} as {}", link.target, alias),
|
||||
None => link.target.clone(),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ");
|
||||
out.push_str(&format!(" - wikilinks: {links}\n"));
|
||||
}
|
||||
}
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for IndexSummary {
|
||||
|
|
@ -344,14 +247,3 @@ fn is_markdown(path: &Path) -> bool {
|
|||
.and_then(|extension| extension.to_str())
|
||||
.is_some_and(|extension| extension.eq_ignore_ascii_case("md"))
|
||||
}
|
||||
|
||||
/// Splits a query into lowercase search terms.
///
/// The query is split on whitespace; each piece has leading and trailing
/// non-alphanumeric characters removed and is lowercased. Pieces that end up
/// empty are dropped. Characters inside a piece are left untouched.
fn query_terms(query: &str) -> Vec<String> {
    let mut terms = Vec::new();
    for piece in query.split_whitespace() {
        let core = piece.trim_matches(|c: char| !c.is_alphanumeric());
        // Lowercasing never empties a non-empty string, so test before allocating.
        if !core.is_empty() {
            terms.push(core.to_lowercase());
        }
    }
    terms
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user