Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,7 @@ OpenKB settings are initialized by `openkb init` and stored in `.openkb/config.y
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
file_processing_jobs: 2 # Files to prepare concurrently during `openkb add <dir>`
```

Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/providers) (OpenAI models can omit the prefix):
Expand All @@ -372,6 +373,8 @@ Model names use `provider/model` LiteLLM [format](https://docs.litellm.ai/docs/p
<summary><i>Advanced options (<code>entity_types</code>, OAuth):</i></summary>
<br>

`file_processing_jobs` (default `2`): number of files prepared concurrently during `openkb add <dir>`. Only the preparation stage is parallelized (hashing, duplicate prefiltering, raw/source staging, conversion); live-KB mutation stays serialized under the mutation lock, so raising this value helps mainly when conversion is the bottleneck.

`entity_types` (optional): a YAML list overriding the entity-type vocabulary used for entity pages; omit it to use the default `person`, `organization`, `place`, `product`, `work`, `event`, `other`.

Subscription-based providers that authenticate via OAuth device flow (e.g. `chatgpt/*`, `github_copilot/*`) need no API key; OpenKB skips the missing-key warning for them.
Expand Down
4 changes: 4 additions & 0 deletions config.yaml.example
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
model: gpt-5.4 # LLM model (any LiteLLM-supported provider)
language: en # Wiki output language
pageindex_threshold: 20 # PDF pages threshold for PageIndex
file_processing_jobs: 2 # Number of files to prepare concurrently during `openkb add <dir>`
# Note: this parallelizes hashing/conversion/staging only. Live KB publish,
# PageIndex indexing, LLM compilation, registry updates, and log writes remain
# serialized under the KB mutation lock.

# Optional: override the entity-type vocabulary used for entity pages.
# Omit this key to use the default 7 types
Expand Down
29 changes: 15 additions & 14 deletions openkb/agent/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
resolve_entity_types,
)
from openkb.lint import list_existing_wiki_targets, strip_ghost_wikilinks
from openkb.locks import atomic_write_text
from openkb.schema import INDEX_SEED, get_agents_md

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -779,7 +780,7 @@ def _write_summary(wiki_dir: Path, doc_name: str, summary: str,
fm_lines.append(f"doc_type: {doc_type}")
fm_lines.append(_yaml_kv_line("full_text", f"sources/{doc_name}.{ext}"))
fm_block = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
(summaries_dir / f"{doc_name}.md").write_text(fm_block + summary, encoding="utf-8")
atomic_write_text(summaries_dir / f"{doc_name}.md", fm_block + summary)


_SAFE_NAME_RE = re.compile(r'[^\w\-]')
Expand Down Expand Up @@ -839,7 +840,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
if brief:
fm_lines.append(_yaml_kv_line("description", brief))
existing = frontmatter.block(fm_lines) + clean
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
return
# Guarantee type + refresh description on update; remove legacy brief:.
ex_parts2 = frontmatter.split(existing)
Expand All @@ -851,7 +852,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
# Drop legacy brief: lines (migrated to description:).
fm_block = frontmatter.drop_line(fm_block, "brief")
existing = fm_block + body
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
else:
clean_parts = frontmatter.split(content)
if clean_parts is not None:
Expand All @@ -863,7 +864,7 @@ def _write_concept(wiki_dir: Path, name: str, content: str, source_file: str, is
if brief:
fm_lines.append(_yaml_kv_line("description", brief))
fm_block = "---\n" + "\n".join(fm_lines) + "\n---\n\n"
path.write_text(fm_block + content, encoding="utf-8")
atomic_write_text(path, fm_block + content)


def _write_entity(
Expand Down Expand Up @@ -927,10 +928,10 @@ def _build_entity_frontmatter(sources: list[str]) -> str:
break
merged = [source_file] + [s for s in recovered if s != source_file]
existing = _build_entity_frontmatter(merged) + clean
path.write_text(existing, encoding="utf-8")
atomic_write_text(path, existing)
return

path.write_text(_build_entity_frontmatter([source_file]) + clean, encoding="utf-8")
atomic_write_text(path, _build_entity_frontmatter([source_file]) + clean)


_set_fm_line = frontmatter.set_line
Expand Down Expand Up @@ -1041,7 +1042,7 @@ def _add_related_link(
text = _prepend_source_to_frontmatter(text, source_file)

text += f"\n\nSee also: {link}"
path.write_text(text, encoding="utf-8")
atomic_write_text(path, text)
return True


Expand All @@ -1068,7 +1069,7 @@ def _backlink_summary_pages(
_ensure_h2_section(lines, section, quiet=True)
for slug in reversed(missing):
_insert_section_entry(lines, section, f"- [[{page_dir}/{slug}]]")
summary_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(summary_path, "\n".join(lines))


def _backlink_pages(
Expand All @@ -1089,7 +1090,7 @@ def _backlink_pages(
lines = text.split("\n")
_ensure_h2_section(lines, "## Related Documents", quiet=True)
_insert_section_entry(lines, "## Related Documents", f"- {link}")
path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(path, "\n".join(lines))


def _backlink_summary(wiki_dir: Path, doc_name: str, concept_slugs: list[str]) -> None:
Expand Down Expand Up @@ -1195,7 +1196,7 @@ def _remove_doc_from_pages(
path.unlink()
deleted.append(path.stem)
elif new_text != text:
path.write_text(new_text, encoding="utf-8")
atomic_write_text(path, new_text)
modified.append(path.stem)

return {"modified": modified, "deleted": deleted}
Expand Down Expand Up @@ -1291,7 +1292,7 @@ def remove_doc_from_index(wiki_dir: Path, doc_name: str, concept_slugs_deleted:
while _remove_section_entry(lines, "## Entities", entity_link):
pass

index_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(index_path, "\n".join(lines))


def _update_index(
Expand All @@ -1315,7 +1316,7 @@ def _update_index(

index_path = wiki_dir / "index.md"
if not index_path.exists():
index_path.write_text(INDEX_SEED, encoding="utf-8")
atomic_write_text(index_path, INDEX_SEED)

lines = index_path.read_text(encoding="utf-8").split("\n")

Expand Down Expand Up @@ -1361,7 +1362,7 @@ def _update_index(
else:
_insert_section_entry(lines, "## Entities", entry)

index_path.write_text("\n".join(lines), encoding="utf-8")
atomic_write_text(index_path, "\n".join(lines))


# ---------------------------------------------------------------------------
Expand Down Expand Up @@ -2035,7 +2036,7 @@ async def compile_long_doc(
updated = fm_block + body
if updated != summary_content:
summary_content = updated
summary_path.write_text(summary_content, encoding="utf-8")
atomic_write_text(summary_path, summary_content)

# Base context A. cache_control marker on the doc message creates a
# cache breakpoint covering (system + doc) for every concept call.
Expand Down
Loading