From a6ad3d471f0a125e703593bada94e6fe684ef0ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 19 Jun 2024 00:06:11 +0200 Subject: [PATCH 01/29] script.sh: short-circuit find-file-doc-comments.pl using an heuristic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid calling this parse-docs script that is expensive. This heuristic avoids running it on most files, and is almost free. Signed-off-by: Théo Lebrun --- script.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/script.sh b/script.sh index 3bbff2a7..6c77ec72 100755 --- a/script.sh +++ b/script.sh @@ -204,7 +204,13 @@ parse_docs() tmpfile=`mktemp` git cat-file blob "$opt1" > "$tmpfile" - "$script_dir/find-file-doc-comments.pl" "$tmpfile" || exit "$?" + + # Shortcut: if '/**' isn't present in the file, it cannot contain a doc. + # This avoids calling find-file-doc-comments.pl on most files, which is an + # expensive operation. + if grep -qF '/**' "$tmpfile"; then + "$script_dir/find-file-doc-comments.pl" "$tmpfile" || exit "$?" + fi rm -rf "$tmpfile" } From eb368501f87fbc146fe8e66d9fc2cdb7226de907 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Fri, 11 Oct 2024 20:15:17 +0200 Subject: [PATCH 02/29] script.sh: do not sort ctags output By default ctags sorts entries. This is not useful to the update script, but takes time. user time for `update.py 16` on musl v1.2.5 went from 1m21.613s to 1m11.849s. --- script.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script.sh b/script.sh index 6c77ec72..656a2633 100755 --- a/script.sh +++ b/script.sh @@ -165,7 +165,7 @@ parse_defs_C() git cat-file blob "$opt1" > "$full_path" # Use ctags to parse most of the defs - ctags -x --kinds-c=+p+x --extras='-{anonymous}' "$full_path" | + ctags -u -x --kinds-c=+p+x --extras='-{anonymous}' "$full_path" | grep -avE -e "^operator " -e "^CONFIG_" | awk '{print $1" "$2" "$3}' @@ -182,7 +182,7 @@ parse_defs_K() tmp=`mktemp -d` full_path=$tmp/$opt2 git cat-file blob "$opt1" > "$full_path" - ctags -x --language-force=kconfig --kinds-kconfig=c --extras-kconfig=-{configPrefixed} "$full_path" | + ctags -u -x --language-force=kconfig --kinds-kconfig=c --extras-kconfig=-{configPrefixed} "$full_path" | awk '{print "CONFIG_"$1" "$2" "$3}' rm "$full_path" rmdir $tmp @@ -193,7 +193,7 @@ parse_defs_D() tmp=`mktemp -d` full_path=$tmp/$opt2 git cat-file blob "$opt1" > "$full_path" - ctags -x --language-force=dts "$full_path" | + ctags -u -x --language-force=dts "$full_path" | awk '{print $1" "$2" "$3}' rm "$full_path" rmdir $tmp From 2b17fbc34895263c7dd19a657eab245ad46eb919 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 21 Dec 2024 20:12:58 +0100 Subject: [PATCH 03/29] update: Rewrite update script New update script divides work into tasks scheduled between a constant number of processes, instead of statically assigning a single long running task to each thread. This results in better CPU saturation. Database handles are not shared between threads anymore, instead the main thread is used to commit results of other processes into the database. This trades locking on database access for serialization costs - since multiprocessing is used, values returned from futures are pickled. 
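
For illustration, the scheduling pattern is roughly the following minimal,
self-contained sketch (parse_blob/commit_result are illustrative names, not
the actual Elixir code): worker processes return only plain, picklable values,
and the parent process is the only one that writes to the database.

# Sketch of the pool-based scheduling described above; illustrative only.
from multiprocessing import Pool, cpu_count

def parse_blob(blob_id):
    # CPU-bound parsing stage (ctags, tokenizer, ...) running in a worker
    # process; the returned dict is pickled back to the parent process.
    return blob_id, {f"ident{blob_id}": [1, 2, 3]}

def commit_result(db, blob_id, defs):
    # Runs only in the parent process, which owns the database handle,
    # so no locking is needed around database accesses.
    for ident, lines in defs.items():
        db.setdefault(ident, []).append((blob_id, lines))

if __name__ == "__main__":
    db = {}                        # stands in for the Berkeley DB databases
    blob_ids = list(range(1000))   # stands in for one tag's new blobs
    chunk = max(1, len(blob_ids) // cpu_count())
    with Pool() as pool:
        for blob_id, defs in pool.imap_unordered(parse_blob, blob_ids, chunk):
            commit_result(db, blob_id, defs)

In the real script the chunk size is additionally capped (at most 100 blobs
per task) so that tasks stay reasonably balanced across the worker processes.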
--- elixir/data.py | 51 ++++++-- elixir/lib.py | 3 +- elixir/update.py | 329 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 370 insertions(+), 13 deletions(-) create mode 100644 elixir/update.py diff --git a/elixir/data.py b/elixir/data.py index b952943c..54a0796c 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -21,10 +21,16 @@ import berkeleydb import re from . import lib +from .lib import autoBytes import os import os.path import errno +# Cache size used by the update script for the largest databases. Tuple of (gigabytes, bytes). +# https://docs.oracle.com/database/bdb181/html/api_reference/C/dbset_cachesize.html +# https://docs.oracle.com/database/bdb181/html/programmer_reference/general_am_conf.html#am_conf_cachesize +CACHESIZE = (2,0) + deflist_regex = re.compile(b'(\d*)(\w)(\d*)(\w),?') deflist_macro_regex = re.compile('\dM\d+(\w)') @@ -72,6 +78,14 @@ def iter(self, dummy=False): if dummy: yield maxId, None, None, None + def exists(self, idx, line_num): + entries = deflist_regex.findall(self.data) + for id, _, line, _ in entries: + if id == idx and int(line) == line_num: + return True + + return False + def append(self, id, type, line, family): if type not in defTypeD: return @@ -145,11 +159,14 @@ def pack(self): return self.data class BsdDB: - def __init__(self, filename, readonly, contentType, shared=False): + def __init__(self, filename, readonly, contentType, shared=False, cachesize=None): self.filename = filename self.db = berkeleydb.db.DB() flags = berkeleydb.db.DB_THREAD if shared else 0 + if cachesize is not None: + self.db.set_cachesize(cachesize[0], cachesize[1]) + if readonly: flags |= berkeleydb.db.DB_RDONLY self.db.open(filename, flags=flags) @@ -159,26 +176,32 @@ def __init__(self, filename, readonly, contentType, shared=False): self.ctype = contentType def exists(self, key): - key = lib.autoBytes(key) + key = autoBytes(key) return self.db.exists(key) def get(self, key): - key = lib.autoBytes(key) + key = autoBytes(key) p = self.db.get(key) - return self.ctype(p) if p is not None else None + if p is None: + return None + p = self.ctype(p) + return p def get_keys(self): return self.db.keys() def put(self, key, val, sync=False): - key = lib.autoBytes(key) - val = lib.autoBytes(val) + key = autoBytes(key) + val = autoBytes(val) if type(val) is not bytes: val = val.pack() self.db.put(key, val) if sync: self.db.sync() + def sync(self): + self.db.sync() + def close(self): self.db.close() @@ -186,13 +209,17 @@ def __len__(self): return self.db.stat()["nkeys"] class DB: - def __init__(self, dir, readonly=True, dtscomp=False, shared=False): + def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=False): if os.path.isdir(dir): self.dir = dir else: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dir) ro = readonly + cachesize = None + + if update_cache: + cachesize = CACHESIZE self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared) # Key-value store of basic information @@ -203,7 +230,7 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared) # Map serial number to filename self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared) - self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared) + self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared, cachesize=cachesize) self.defs_cache = {} NOOP = lambda x: x self.defs_cache['C'] = BsdDB(dir + 
'/definitions-cache-C.db', ro, NOOP, shared=shared) @@ -211,12 +238,12 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False): self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared) self.defs_cache['M'] = BsdDB(dir + '/definitions-cache-M.db', ro, NOOP, shared=shared) assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES) - self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared) - self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared) + self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared, cachesize=cachesize) + self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared, cachesize=cachesize) self.dtscomp = dtscomp if dtscomp: - self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared) - self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared) + self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared, cachesize=cachesize) + self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared, cachesize=cachesize) # Use a RefList in case there are multiple doc comments for an identifier def close(self): diff --git a/elixir/lib.py b/elixir/lib.py index 7d7d0757..2442e107 100755 --- a/elixir/lib.py +++ b/elixir/lib.py @@ -21,6 +21,7 @@ import sys import logging import subprocess, os +from typing import List logger = logging.getLogger(__name__) @@ -46,7 +47,7 @@ def run_cmd(*args, env=None): # Invoke ./script.sh with the given arguments # Returns the list of output lines -def scriptLines(*args, env=None): +def scriptLines(*args, env=None) -> List[bytes]: p = script(*args, env=env) p = p.split(b'\n') del p[-1] diff --git a/elixir/update.py b/elixir/update.py new file mode 100644 index 00000000..cbbde79a --- /dev/null +++ b/elixir/update.py @@ -0,0 +1,329 @@ +import logging +from multiprocessing import cpu_count +from multiprocessing.pool import Pool +from typing import Dict, Iterable, List, Optional, Tuple + +from find_compatible_dts import FindCompatibleDTS + +from elixir.data import DB, BsdDB, DefList, PathList, RefList +from elixir.lib import ( + compatibleFamily, + compatibleMacro, + getDataDir, + getFileFamily, + isIdent, + script, + scriptLines, +) + +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s') +logger = logging.getLogger(__name__) + +# File identification - id, hash, filename +FileId = Tuple[int, bytes, str] + +# Definitions parsing output, ident -> list of (file_idx, type, line, family) +DefsDict = Dict[bytes, List[Tuple[int, str, int, str]]] + +# References parsing output, ident -> (file_idx, family) -> list of lines +RefsDict = Dict[bytes, Dict[Tuple[int, str], List[int]]] + +# Cache of definitions found in current tag, ident -> list of (file_idx, line) +DefCache = Dict[bytes, List[Tuple[int, int]]] + +# Generic dictionary of ident -> list of lines +LinesListDict = Dict[str, List[int]] + +# Add definitions to database +def add_defs(db: DB, def_cache: DefCache, defs: DefsDict): + for ident, occ_list in defs.items(): + obj = db.defs.get(ident) + if obj is None: + obj = DefList() + + if ident in def_cache: + lines_list = def_cache[ident] + else: + lines_list = [] + def_cache[ident] = lines_list + + for (idx, type, line, family) in occ_list: + obj.append(idx, type, line, family) + lines_list.append((idx, line)) + + db.defs.put(ident, obj) + +# Add references to database +def add_refs(db: DB, def_cache: DefCache, refs: RefsDict): + for ident, 
idx_to_lines in refs.items(): + # Skip reference if definition was not collected in this tag + deflist = def_cache.get(ident) + if deflist is None: + continue + + def deflist_exists(idx, n): + for didx, dn in deflist: + if didx == idx and dn == n: + return True + return False + + obj = db.refs.get(ident) + if obj is None: + obj = RefList() + + for (idx, family), lines in idx_to_lines.items(): + lines = [n for n in lines if not deflist_exists(str(idx).encode(), n)] + + if len(lines) != 0: + lines_str = ','.join((str(n) for n in lines)) + obj.append(idx, lines_str, family) + + db.refs.put(ident, obj) + +# Add documentation references to database +def add_docs(db: DB, idx: int, family: str, docs: Dict[str, List[int]]): + add_to_lineslist(db.docs, idx, family, docs) + +# Add compatible references to database +def add_comps(db: DB, idx: int, family: str, comps: Dict[str, List[int]]): + add_to_lineslist(db.comps, idx, family, comps) + +# Add compatible docs to database +def add_comps_docs(db: DB, idx: int, family: str, comps_docs: Dict[str, List[int]]): + comps_result = {} + for ident, v in comps_docs.items(): + if db.comps.exists(ident): + comps_result[ident] = v + + add_to_lineslist(db.comps_docs, idx, family, comps_result) + +# Add data to a database file that uses lines list schema +def add_to_lineslist(db_file: BsdDB, idx: int, family: str, to_add: Dict[str, List[int]]): + for ident, lines in to_add.items(): + obj = db_file.get(ident) + if obj is None: + obj = RefList() + + lines_str = ','.join((str(n) for n in lines)) + obj.append(idx, lines_str, family) + db_file.put(ident, obj) + + +# Adds blob list to database, returns blob id -> (hash, filename) dict +def collect_blobs(db: DB, tag: bytes) -> Dict[int, Tuple[bytes, str]]: + idx = db.vars.get('numBlobs') + if idx is None: + idx = 0 + + # Get blob hashes and associated file names (without path) + blobs = scriptLines('list-blobs', '-f', tag) + versionBuf = [] + idx_to_hash_and_filename = {} + + # Collect new blobs, assign database ids to the blobs + for blob in blobs: + hash, filename = blob.split(b' ',maxsplit=1) + blob_exist = db.blob.exists(hash) + versionBuf.append((idx, filename)) + if not blob_exist: + idx_to_hash_and_filename[idx] = (hash, filename.decode()) + db.blob.put(hash, idx) + db.hash.put(idx, hash) + db.file.put(idx, filename) + idx += 1 + + # Update number of blobs in the database + db.vars.put('numBlobs', idx) + + # Add mapping blob id -> path to version database + versionBuf.sort() + obj = PathList() + for idx, path in versionBuf: + obj.append(idx, path) + db.vers.put(tag, obj, sync=True) + + return idx_to_hash_and_filename + +# Generate definitions cache databases +def generate_defs_caches(db: DB): + for key in db.defs.get_keys(): + value = db.defs.get(key) + for family in ['C', 'K', 'D', 'M']: + if (compatibleFamily(value.get_families(), family) or + compatibleMacro(value.get_macros(), family)): + db.defs_cache[family].put(key, b'') + + +# Collect definitions from ctags for a file +def get_defs(file_id: FileId) -> Optional[DefsDict]: + idx, hash, filename = file_id + defs = {} + family = getFileFamily(filename) + if family in (None, 'M'): + return None + + lines = scriptLines('parse-defs', hash, filename, family) + + for l in lines: + ident, type, line = l.split(b' ') + type = type.decode() + line = int(line.decode()) + if isIdent(ident): + if ident not in defs: + defs[ident] = [] + defs[ident].append((idx, type, line, family)) + + return defs + +# Collect references from the tokenizer for a file +def get_refs(file_id: 
FileId) -> Optional[RefsDict]: + idx, hash, filename = file_id + refs = {} + family = getFileFamily(filename) + if family is None: + return + + # Kconfig values are saved as CONFIG_ + prefix = b'' if family != 'K' else b'CONFIG_' + + tokens = scriptLines('tokenize-file', '-b', hash, family) + even = True + line_num = 1 + + for tok in tokens: + even = not even + if even: + tok = prefix + tok + + # We only index CONFIG_??? in makefiles + if (family != 'M' or tok.startswith(b'CONFIG_')): + if tok not in refs: + refs[tok] = {} + + if (idx, family) not in refs[tok]: + refs[tok][(idx, family)] = [] + + refs[tok][(idx, family)].append(line_num) + + else: + line_num += tok.count(b'\1') + + return refs + +# Collect compatible script output into lineslinst-schema compatible format +def collect_get_blob_output(lines: Iterable[str]) -> LinesListDict: + results = {} + for l in lines: + ident, line = l.split(' ') + line = int(line) + + if ident not in results: + results[ident] = [] + results[ident].append(line) + + return results + +# Collect docs from doc comments script for a single file +def get_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, filename = file_id + family = getFileFamily(filename) + if family in (None, 'M'): return + + lines = (line.decode() for line in scriptLines('parse-docs', hash, filename)) + docs = collect_get_blob_output(lines) + + return (idx, family, docs) + +# Collect compatible references for a single file +def get_comps(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, filename = file_id + family = getFileFamily(filename) + if family in (None, 'K', 'M'): return + + compatibles_parser = FindCompatibleDTS() + lines = compatibles_parser.run(scriptLines('get-blob', hash), family) + comps = collect_get_blob_output(lines) + + return (idx, family, comps) + +# Collect compatible documentation references for a single file +def get_comps_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: + idx, hash, _ = file_id + family = 'B' + + compatibles_parser = FindCompatibleDTS() + lines = compatibles_parser.run(scriptLines('get-blob', hash), family) + comps_docs = {} + for l in lines: + ident, line = l.split(' ') + + if ident not in comps_docs: + comps_docs[ident] = [] + comps_docs[ident].append(int(line)) + + return (idx, family, comps_docs) + + +# Update a single version - collects data from all the stages and saves it in the database +def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): + idx_to_hash_and_filename = collect_blobs(db, tag) + def_cache = {} + + # Collect blobs to process and split list of blobs into chunks + idxes = [(idx, hash, filename) for (idx, (hash, filename)) in idx_to_hash_and_filename.items()] + chunksize = int(len(idxes) / cpu_count()) + chunksize = min(max(1, chunksize), 100) + + collect_blobs(db, tag) + logger.info("collecting blobs done") + + for result in pool.imap_unordered(get_defs, idxes, chunksize): + if result is not None: + add_defs(db, def_cache, result) + + logger.info("defs done") + + for result in pool.imap_unordered(get_docs, idxes, chunksize): + if result is not None: + add_docs(db, *result) + + logger.info("docs done") + + if dts_comp_support: + for result in pool.imap_unordered(get_comps, idxes, chunksize): + if result is not None: + add_comps(db, *result) + + logger.info("dts comps done") + + for result in pool.imap_unordered(get_comps_docs, idxes, chunksize): + if result is not None: + add_comps_docs(db, *result) + + logger.info("dts comps docs done") 
+ + for result in pool.imap_unordered(get_refs, idxes, chunksize): + if result is not None: + add_refs(db, def_cache, result) + + logger.info("refs done") + + generate_defs_caches(db) + logger.info("update done") + + +if __name__ == "__main__": + dts_comp_support = bool(int(script('dts-comp'))) + db = None + + with Pool() as pool: + for tag in scriptLines('list-tags'): + if db is None: + db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=True) + + if not db.vers.exists(tag): + logger.info("updating tag %s", tag) + update_version(db, tag, pool, dts_comp_support) + db.close() + db = None + From 265fcdbf974aa6024ee305a1489e739eac73c4a2 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 29 Dec 2024 21:22:22 +0100 Subject: [PATCH 04/29] update: Remove old update script --- update.py | 638 ------------------------------------------------------ 1 file changed, 638 deletions(-) delete mode 100755 update.py diff --git a/update.py b/update.py deleted file mode 100755 index 9d84ff31..00000000 --- a/update.py +++ /dev/null @@ -1,638 +0,0 @@ -#!/usr/bin/env python3 - -# This file is part of Elixir, a source code cross-referencer. -# -# Copyright (C) 2017--2020 Mikaël Bouillot -# Maxime Chretien -# and contributors -# -# Elixir is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Elixir is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with Elixir. If not, see . - -# Throughout, an "idx" is the sequential number associated with a blob. -# This is different from that blob's Git hash. - -from sys import argv -from threading import Thread, Lock, Event, Condition - -import elixir.lib as lib -from elixir.lib import script, scriptLines -import elixir.data as data -from elixir.data import PathList -from find_compatible_dts import FindCompatibleDTS - -verbose = False - -dts_comp_support = int(script('dts-comp')) - -compatibles_parser = FindCompatibleDTS() - -db = data.DB(lib.getDataDir(), readonly=False, shared=True, dtscomp=dts_comp_support) - -# Number of cpu threads (+2 for version indexing) -cpu = 10 -threads_list = [] - -hash_file_lock = Lock() # Lock for db.hash and db.file -blobs_lock = Lock() # Lock for db.blobs -defs_lock = Lock() # Lock for db.defs -refs_lock = Lock() # Lock for db.refs -docs_lock = Lock() # Lock for db.docs -comps_lock = Lock() # Lock for db.comps -comps_docs_lock = Lock() # Lock for db.comps_docs -tag_ready = Condition() # Waiting for new tags - -new_idxes = [] # (new idxes, Event idxes ready, Event defs ready, Event comps ready, Event vers ready) -bindings_idxes = [] # DT bindings documentation files -idx_key_mod = 1000000 -defs_idxes = {} # Idents definitions stored with (idx*idx_key_mod + line) as the key. 
- -tags_done = False # True if all tags have been added to new_idxes - -# Progress variables [tags, finished threads] -tags_defs = [0, 0] -tags_defs_lock = Lock() -tags_refs = [0, 0] -tags_refs_lock = Lock() -tags_docs = [0, 0] -tags_docs_lock = Lock() -tags_comps = [0, 0] -tags_comps_lock = Lock() -tags_comps_docs = [0, 0] -tags_comps_docs_lock = Lock() - -class UpdateIds(Thread): - def __init__(self, tag_buf): - Thread.__init__(self, name="UpdateIdsElixir") - self.tag_buf = tag_buf - - def run(self): - global new_idxes, tags_done, tag_ready - self.index = 0 - - for tag in self.tag_buf: - - new_idxes.append((self.update_blob_ids(tag), Event(), Event(), Event(), Event())) - - progress('ids: ' + tag.decode() + ': ' + str(len(new_idxes[self.index][0])) + - ' new blobs', self.index+1) - - new_idxes[self.index][1].set() # Tell that the tag is ready - - self.index += 1 - - # Wake up waiting threads - with tag_ready: - tag_ready.notify_all() - - tags_done = True - progress('ids: Thread finished', self.index) - - def update_blob_ids(self, tag): - - global hash_file_lock, blobs_lock - - if db.vars.exists('numBlobs'): - idx = db.vars.get('numBlobs') - else: - idx = 0 - - # Get blob hashes and associated file names (without path) - blobs = scriptLines('list-blobs', '-f', tag) - - new_idxes = [] - for blob in blobs: - hash, filename = blob.split(b' ',maxsplit=1) - with blobs_lock: - blob_exist = db.blob.exists(hash) - if not blob_exist: - db.blob.put(hash, idx) - - if not blob_exist: - with hash_file_lock: - db.hash.put(idx, hash) - db.file.put(idx, filename) - - new_idxes.append(idx) - if verbose: - print(f"New blob #{idx} {hash}:{filename}") - idx += 1 - db.vars.put('numBlobs', idx) - return new_idxes - - -class UpdateVersions(Thread): - def __init__(self, tag_buf): - Thread.__init__(self, name="UpdateVersionsElixir") - self.tag_buf = tag_buf - - def run(self): - global new_idxes, tag_ready - - index = 0 - - while index < len(self.tag_buf): - if index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - tag = self.tag_buf[index] - - new_idxes[index][1].wait() # Make sure the tag is ready - - self.update_versions(tag) - - new_idxes[index][4].set() # Tell that UpdateVersions processed the tag - - progress('vers: ' + tag.decode() + ' done', index+1) - - index += 1 - - progress('vers: Thread finished', index) - - def update_versions(self, tag): - global blobs_lock - - # Get blob hashes and associated file paths - blobs = scriptLines('list-blobs', '-p', tag) - buf = [] - - for blob in blobs: - hash, path = blob.split(b' ', maxsplit=1) - with blobs_lock: - idx = db.blob.get(hash) - buf.append((idx, path)) - - buf = sorted(buf) - obj = PathList() - for idx, path in buf: - obj.append(idx, path) - - # Store DT bindings documentation files to parse them later - if path[:33] == b'Documentation/devicetree/bindings': - bindings_idxes.append(idx) - - if verbose: - print(f"Tag {tag}: adding #{idx} {path}") - db.vers.put(tag, obj, sync=True) - - -def generate_defs_caches(): - for key in db.defs.get_keys(): - value = db.defs.get(key) - for family in ['C', 'K', 'D', 'M']: - if (lib.compatibleFamily(value.get_families(), family) or - lib.compatibleMacro(value.get_macros(), family)): - db.defs_cache[family].put(key, b'') - - -class UpdateDefs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateDefsElixir") - self.index = start - self.inc = inc # Equivalent to the number of defs threads - - def run(self): - global new_idxes, tags_done, tag_ready, 
tags_defs, tags_defs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_defs_lock: - tags_defs[0] += 1 - - self.update_definitions(new_idxes[self.index][0]) - - new_idxes[self.index][2].set() # Tell that UpdateDefs processed the tag - - self.index += self.inc - - with tags_defs_lock: - tags_defs[1] += 1 - progress('defs: Thread ' + str(tags_defs[1]) + '/' + str(self.inc) + ' finished', tags_defs[0]) - - - def update_definitions(self, idxes): - global hash_file_lock, defs_lock, tags_defs - - for idx in idxes: - if idx % 1000 == 0: progress('defs: ' + str(idx), tags_defs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'M']: continue - - lines = scriptLines('parse-defs', hash, filename, family) - - with defs_lock: - for l in lines: - ident, type, line = l.split(b' ') - type = type.decode() - line = int(line.decode()) - - defs_idxes[idx*idx_key_mod + line] = ident - - if db.defs.exists(ident): - obj = db.defs.get(ident) - elif lib.isIdent(ident): - obj = data.DefList() - else: - continue - - obj.append(idx, type, line, family) - if verbose: - print(f"def {type} {ident} in #{idx} @ {line}") - db.defs.put(ident, obj) - - generate_defs_caches() - - -class UpdateRefs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateRefsElixir") - self.index = start - self.inc = inc # Equivalent to the number of refs threads - - def run(self): - global new_idxes, tags_done, tags_refs, tags_refs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - new_idxes[self.index][2].wait() # Make sure UpdateDefs processed the tag - - with tags_refs_lock: - tags_refs[0] += 1 - - self.update_references(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_refs_lock: - tags_refs[1] += 1 - progress('refs: Thread ' + str(tags_refs[1]) + '/' + str(self.inc) + ' finished', tags_refs[0]) - - def update_references(self, idxes): - global hash_file_lock, defs_lock, refs_lock, tags_refs - - for idx in idxes: - if idx % 1000 == 0: progress('refs: ' + str(idx), tags_refs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family == None: continue - - prefix = b'' - # Kconfig values are saved as CONFIG_ - if family == 'K': - prefix = b'CONFIG_' - - tokens = scriptLines('tokenize-file', '-b', hash, family) - even = True - line_num = 1 - idents = {} - with defs_lock: - for tok in tokens: - even = not even - if even: - tok = prefix + tok - - if (db.defs.exists(tok) and - not ( (idx*idx_key_mod + line_num) in defs_idxes and - defs_idxes[idx*idx_key_mod + line_num] == tok ) and - (family != 'M' or tok.startswith(b'CONFIG_'))): - # We only index CONFIG_??? 
in makefiles - if tok in idents: - idents[tok] += ',' + str(line_num) - else: - idents[tok] = str(line_num) - - else: - line_num += tok.count(b'\1') - - with refs_lock: - for ident, lines in idents.items(): - if db.refs.exists(ident): - obj = db.refs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"ref: {ident} in #{idx} @ {lines}") - db.refs.put(ident, obj) - - -class UpdateDocs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateDocsElixir") - self.index = start - self.inc = inc # Equivalent to the number of docs threads - - def run(self): - global new_idxes, tags_done, tags_docs, tags_docs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_docs_lock: - tags_docs[0] += 1 - - self.update_doc_comments(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_docs_lock: - tags_docs[1] += 1 - progress('docs: Thread ' + str(tags_docs[1]) + '/' + str(self.inc) + ' finished', tags_docs[0]) - - def update_doc_comments(self, idxes): - global hash_file_lock, docs_lock, tags_docs - - for idx in idxes: - if idx % 1000 == 0: progress('docs: ' + str(idx), tags_docs[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'M']: continue - - lines = scriptLines('parse-docs', hash, filename) - with docs_lock: - for l in lines: - ident, line = l.split(b' ') - line = int(line.decode()) - - if db.docs.exists(ident): - obj = db.docs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, str(line), family) - if verbose: - print(f"doc: {ident} in #{idx} @ {line}") - db.docs.put(ident, obj) - - -class UpdateComps(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateCompsElixir") - self.index = start - self.inc = inc # Equivalent to the number of comps threads - - def run(self): - global new_idxes, tags_done, tags_comps, tags_comps_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - - with tags_comps_lock: - tags_comps[0] += 1 - - self.update_compatibles(new_idxes[self.index][0]) - - new_idxes[self.index][3].set() # Tell that UpdateComps processed the tag - - self.index += self.inc - - with tags_comps_lock: - tags_comps[1] += 1 - progress('comps: Thread ' + str(tags_comps[1]) + '/' + str(self.inc) + ' finished', tags_comps[0]) - - def update_compatibles(self, idxes): - global hash_file_lock, comps_lock, tags_comps - - for idx in idxes: - if idx % 1000 == 0: progress('comps: ' + str(idx), tags_comps[0]) - - with hash_file_lock: - hash = db.hash.get(idx) - filename = db.file.get(idx) - - family = lib.getFileFamily(filename) - if family in [None, 'K', 'M']: continue - - lines = compatibles_parser.run(scriptLines('get-blob', hash), family) - comps = {} - for l in lines: - ident, line = l.split(' ') - - if ident in comps: - comps[ident] += ',' + str(line) - else: - comps[ident] = str(line) - - with comps_lock: - for ident, lines in comps.items(): - if db.comps.exists(ident): - obj = db.comps.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"comps: {ident} in #{idx} @ {line}") - 
db.comps.put(ident, obj) - - -class UpdateCompsDocs(Thread): - def __init__(self, start, inc): - Thread.__init__(self, name="UpdateCompsDocsElixir") - self.index = start - self.inc = inc # Equivalent to the number of comps_docs threads - - def run(self): - global new_idxes, tags_done, tags_comps_docs, tags_comps_docs_lock - - while not (tags_done and self.index >= len(new_idxes)): - if self.index >= len(new_idxes): - # Wait for new tags - with tag_ready: - tag_ready.wait() - continue - - new_idxes[self.index][1].wait() # Make sure the tag is ready - new_idxes[self.index][3].wait() # Make sure UpdateComps processed the tag - new_idxes[self.index][4].wait() # Make sure UpdateVersions processed the tag - - with tags_comps_docs_lock: - tags_comps_docs[0] += 1 - - self.update_compatibles_bindings(new_idxes[self.index][0]) - - self.index += self.inc - - with tags_comps_docs_lock: - tags_comps_docs[1] += 1 - progress('comps_docs: Thread ' + str(tags_comps_docs[1]) + '/' + str(self.inc) + ' finished', tags_comps_docs[0]) - - def update_compatibles_bindings(self, idxes): - global hash_file_lock, comps_lock, comps_docs_lock, tags_comps_docs, bindings_idxes - - for idx in idxes: - if idx % 1000 == 0: progress('comps_docs: ' + str(idx), tags_comps_docs[0]) - - if not idx in bindings_idxes: # Parse only bindings doc files - continue - - with hash_file_lock: - hash = db.hash.get(idx) - - family = 'B' - lines = compatibles_parser.run(scriptLines('get-blob', hash), family) - comps_docs = {} - with comps_lock: - for l in lines: - ident, line = l.split(' ') - - if db.comps.exists(ident): - if ident in comps_docs: - comps_docs[ident] += ',' + str(line) - else: - comps_docs[ident] = str(line) - - with comps_docs_lock: - for ident, lines in comps_docs.items(): - if db.comps_docs.exists(ident): - obj = db.comps_docs.get(ident) - else: - obj = data.RefList() - - obj.append(idx, lines, family) - if verbose: - print(f"comps_docs: {ident} in #{idx} @ {line}") - db.comps_docs.put(ident, obj) - - -def progress(msg, current): - print('{} - {} ({:.1%})'.format(project, msg, current/num_tags)) - - -# Main - -# Check number of threads arg -if len(argv) >= 2 and argv[1].isdigit() : - cpu = int(argv[1]) - - if cpu < 5 : - cpu = 5 - -# Distribute threads among functions using the following rules : -# There are more (or equal) refs threads than others -# There are more (or equal) defs threads than docs or comps threads -# Example : if cpu=6 : defs=1, refs=2, docs=1, comps=1, comps_docs=1 -# if cpu=7 : defs=2, refs=2, docs=1, comps=1, comps_docs=1 -# if cpu=8 : defs=2, refs=3, docs=1, comps=1, comps_docs=1 -# if cpu=11: defs=2, refs=3, docs=2, comps=2, comps_docs=2 -quo, rem = divmod(cpu, 5) -num_th_refs = quo -num_th_defs = quo -num_th_docs = quo - -# If DT bindings support is enabled, use $quo threads for each of the 2 threads -# Otherwise add them to the remaining threads -if dts_comp_support: - num_th_comps = quo - num_th_comps_docs = quo -else : - num_th_comps = 0 - num_th_comps_docs = 0 - rem += 2*quo - -quo, rem = divmod(rem, 2) -num_th_defs += quo -num_th_refs += quo + rem - -tag_buf = [] -for tag in scriptLines('list-tags'): - if not db.vers.exists(tag): - tag_buf.append(tag) - -num_tags = len(tag_buf) -project = lib.currentProject() - -print(project + ' - found ' + str(num_tags) + ' new tags') - -if not num_tags: - # Backward-compatibility: generate defs caches if they are empty. 
- if db.defs_cache['C'].db.stat()['nkeys'] == 0: - generate_defs_caches() - exit(0) - -threads_list.append(UpdateIds(tag_buf)) -threads_list.append(UpdateVersions(tag_buf)) - -# Define defs threads -for i in range(num_th_defs): - threads_list.append(UpdateDefs(i, num_th_defs)) -# Define refs threads -for i in range(num_th_refs): - threads_list.append(UpdateRefs(i, num_th_refs)) -# Define docs threads -for i in range(num_th_docs): - threads_list.append(UpdateDocs(i, num_th_docs)) -# Define comps threads -for i in range(num_th_comps): - threads_list.append(UpdateComps(i, num_th_comps)) -# Define comps_docs threads -for i in range(num_th_comps_docs): - threads_list.append(UpdateCompsDocs(i, num_th_comps_docs)) - - -# Start to process tags -threads_list[0].start() - -# Wait until the first tag is ready -with tag_ready: - tag_ready.wait() - -# Start remaining threads -for i in range(1, len(threads_list)): - threads_list[i].start() - -# Make sure all threads finished -for i in range(len(threads_list)): - threads_list[i].join() From 3de821566e5f397230470f037ffbea65950299c6 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Wed, 2 Jul 2025 14:19:49 +0200 Subject: [PATCH 05/29] update: Fix versions database issues Attempt to merge two list-blobs calls during a refactoring completely broke collect_blobs. * Redundant collect_blobs call caused values in vers database to be replaced by a list where all files for a version had the same blob id * Vers database contained filenames instead of paths * Blobs shared between versions had new blob ids for each version --- elixir/update.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index cbbde79a..395b9543 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,3 +1,4 @@ +import os.path import logging from multiprocessing import cpu_count from multiprocessing.pool import Pool @@ -116,17 +117,20 @@ def collect_blobs(db: DB, tag: bytes) -> Dict[int, Tuple[bytes, str]]: idx = 0 # Get blob hashes and associated file names (without path) - blobs = scriptLines('list-blobs', '-f', tag) + blobs = scriptLines('list-blobs', '-p', tag) versionBuf = [] idx_to_hash_and_filename = {} # Collect new blobs, assign database ids to the blobs for blob in blobs: - hash, filename = blob.split(b' ',maxsplit=1) - blob_exist = db.blob.exists(hash) - versionBuf.append((idx, filename)) - if not blob_exist: - idx_to_hash_and_filename[idx] = (hash, filename.decode()) + hash, path = blob.split(b' ',maxsplit=1) + filename = os.path.basename(path.decode()) + blob_idx = db.blob.get(hash) + if blob_idx is not None: + versionBuf.append((blob_idx, path)) + else: + versionBuf.append((idx, path)) + idx_to_hash_and_filename[idx] = (hash, filename) db.blob.put(hash, idx) db.hash.put(idx, hash) db.file.put(idx, filename) @@ -138,8 +142,8 @@ def collect_blobs(db: DB, tag: bytes) -> Dict[int, Tuple[bytes, str]]: # Add mapping blob id -> path to version database versionBuf.sort() obj = PathList() - for idx, path in versionBuf: - obj.append(idx, path) + for i, path in versionBuf: + obj.append(i, path) db.vers.put(tag, obj, sync=True) return idx_to_hash_and_filename @@ -274,7 +278,6 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): chunksize = int(len(idxes) / cpu_count()) chunksize = min(max(1, chunksize), 100) - collect_blobs(db, tag) logger.info("collecting blobs done") for result in pool.imap_unordered(get_defs, idxes, chunksize): From d72291dacbbddeb8af7575f52e416faae3cc87c4 Mon Sep 
17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Wed, 18 Jun 2025 17:17:47 +0200 Subject: [PATCH 06/29] utils/index: fix to use reworked update script MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Théo Lebrun --- utils/index | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/utils/index b/utils/index index 6e84a3e7..61250a22 100755 --- a/utils/index +++ b/utils/index @@ -59,14 +59,10 @@ project_fetch() { # $1 is the project path (parent of data/ and repo/). project_index() { - if test -z "$ELIXIR_THREADS"; then - ELIXIR_THREADS="$(nproc)" - fi - elixir_sources="$(dirname "$(dirname "$0")")" LXR_REPO_DIR=$1/repo LXR_DATA_DIR=$1/data \ - python3 "$elixir_sources/update.py" $ELIXIR_THREADS + python3 -m elixir.update } # $1 is the Elixir root data path. From 5aa8192842dba0e2894500b764798dc1aea67bbf Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Wed, 2 Jul 2025 15:37:06 +0200 Subject: [PATCH 07/29] update: Add references only if definition visible in version Fixes #292 Previous attempt didn't actually work and caused references to be missing if definition was in an older file. --- elixir/update.py | 71 +++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 28 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index 395b9543..eb19fa55 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -2,7 +2,7 @@ import logging from multiprocessing import cpu_count from multiprocessing.pool import Pool -from typing import Dict, Iterable, List, Optional, Tuple +from typing import Dict, Iterable, List, Optional, Tuple, Set from find_compatible_dts import FindCompatibleDTS @@ -29,42 +29,52 @@ # References parsing output, ident -> (file_idx, family) -> list of lines RefsDict = Dict[bytes, Dict[Tuple[int, str], List[int]]] -# Cache of definitions found in current tag, ident -> list of (file_idx, line) -DefCache = Dict[bytes, List[Tuple[int, int]]] - # Generic dictionary of ident -> list of lines LinesListDict = Dict[str, List[int]] +# File idx -> (hash, filename, is a new file?) 
+IdxCache = Dict[int, Tuple[bytes, str, bool]] + +# Check if definition for ident is visible in current version +def def_in_version(db: DB, def_cache: Set[bytes], idx_to_hash_and_filename: IdxCache, ident: bytes) -> bool: + if ident in def_cache: + return True + + defs_this_ident = db.defs.get(ident) + if not defs_this_ident: + return False + + for def_idx, _, _, _ in defs_this_ident.iter(): + if def_idx in idx_to_hash_and_filename: + def_cache.add(ident) + return True + + return False + # Add definitions to database -def add_defs(db: DB, def_cache: DefCache, defs: DefsDict): +def add_defs(db: DB, defs: DefsDict): for ident, occ_list in defs.items(): obj = db.defs.get(ident) if obj is None: obj = DefList() - if ident in def_cache: - lines_list = def_cache[ident] - else: - lines_list = [] - def_cache[ident] = lines_list - for (idx, type, line, family) in occ_list: obj.append(idx, type, line, family) - lines_list.append((idx, line)) db.defs.put(ident, obj) # Add references to database -def add_refs(db: DB, def_cache: DefCache, refs: RefsDict): +def add_refs(db: DB, def_cache: Set[bytes], idx_to_hash_and_filename: IdxCache, refs: RefsDict): for ident, idx_to_lines in refs.items(): - # Skip reference if definition was not collected in this tag - deflist = def_cache.get(ident) - if deflist is None: + deflist = db.defs.get(ident) + in_version = def_in_version(db, def_cache, idx_to_hash_and_filename, ident) + + if deflist is None or not in_version: continue - def deflist_exists(idx, n): - for didx, dn in deflist: - if didx == idx and dn == n: + def deflist_exists(idx: int, line: int): + for def_idx, _, def_line, _ in deflist.iter(): + if def_idx == idx and def_line == line: return True return False @@ -72,14 +82,17 @@ def deflist_exists(idx, n): if obj is None: obj = RefList() + modified = False for (idx, family), lines in idx_to_lines.items(): - lines = [n for n in lines if not deflist_exists(str(idx).encode(), n)] + lines = [n for n in lines if not deflist_exists(idx, n)] if len(lines) != 0: lines_str = ','.join((str(n) for n in lines)) obj.append(idx, lines_str, family) + modified = True - db.refs.put(ident, obj) + if modified: + db.refs.put(ident, obj) # Add documentation references to database def add_docs(db: DB, idx: int, family: str, docs: Dict[str, List[int]]): @@ -111,7 +124,7 @@ def add_to_lineslist(db_file: BsdDB, idx: int, family: str, to_add: Dict[str, Li # Adds blob list to database, returns blob id -> (hash, filename) dict -def collect_blobs(db: DB, tag: bytes) -> Dict[int, Tuple[bytes, str]]: +def collect_blobs(db: DB, tag: bytes) -> IdxCache: idx = db.vars.get('numBlobs') if idx is None: idx = 0 @@ -126,11 +139,14 @@ def collect_blobs(db: DB, tag: bytes) -> Dict[int, Tuple[bytes, str]]: hash, path = blob.split(b' ',maxsplit=1) filename = os.path.basename(path.decode()) blob_idx = db.blob.get(hash) + if blob_idx is not None: versionBuf.append((blob_idx, path)) + if blob_idx not in idx_to_hash_and_filename: + idx_to_hash_and_filename[blob_idx] = (hash, filename, False) else: versionBuf.append((idx, path)) - idx_to_hash_and_filename[idx] = (hash, filename) + idx_to_hash_and_filename[idx] = (hash, filename, True) db.blob.put(hash, idx) db.hash.put(idx, hash) db.file.put(idx, filename) @@ -271,10 +287,9 @@ def get_comps_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: # Update a single version - collects data from all the stages and saves it in the database def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): idx_to_hash_and_filename = 
collect_blobs(db, tag) - def_cache = {} # Collect blobs to process and split list of blobs into chunks - idxes = [(idx, hash, filename) for (idx, (hash, filename)) in idx_to_hash_and_filename.items()] + idxes = [(idx, hash, filename) for (idx, (hash, filename, new)) in idx_to_hash_and_filename.items() if new] chunksize = int(len(idxes) / cpu_count()) chunksize = min(max(1, chunksize), 100) @@ -282,7 +297,7 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): for result in pool.imap_unordered(get_defs, idxes, chunksize): if result is not None: - add_defs(db, def_cache, result) + add_defs(db, result) logger.info("defs done") @@ -305,16 +320,16 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") + def_cache = set() for result in pool.imap_unordered(get_refs, idxes, chunksize): if result is not None: - add_refs(db, def_cache, result) + add_refs(db, def_cache, idx_to_hash_and_filename, result) logger.info("refs done") generate_defs_caches(db) logger.info("update done") - if __name__ == "__main__": dts_comp_support = bool(int(script('dts-comp'))) db = None From 1e6624dd0500714efbb5aa67af66e5209be186f8 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 15:36:51 +0200 Subject: [PATCH 08/29] add db cache --- elixir/data.py | 93 +++++++++++++++++++++++++++++++++++++++++++----- elixir/update.py | 22 ++++++------ 2 files changed, 95 insertions(+), 20 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 54a0796c..7bc218fc 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -18,6 +18,7 @@ # You should have received a copy of the GNU Affero General Public License # along with Elixir. If not, see . +from typing import OrderedDict import berkeleydb import re from . 
import lib @@ -208,18 +209,94 @@ def close(self): def __len__(self): return self.db.stat()["nkeys"] +class CachedBsdDB: + def __init__(self, filename, readonly, contentType, cachesize): + self.filename = filename + self.db = berkeleydb.db.DB() + flags = 0 + + self.cachesize = cachesize + self.cache = OrderedDict() + + if readonly: + flags |= berkeleydb.db.DB_RDONLY + self.db.open(filename, flags=flags) + else: + flags |= berkeleydb.db.DB_CREATE + self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + self.ctype = contentType + + def exists(self, key): + if key in self.cache: + return True + + key = autoBytes(key) + return self.db.exists(key) + + def get(self, key): + if key in self.cache: + self.cache.move_to_end(key) + return self.cache[key] + + key = autoBytes(key) + p = self.db.get(key) + if p is None: + return None + p = self.ctype(p) + + self.cache[key] = p + self.cache.move_to_end(key) + if len(self.cache) > self.cachesize: + old_k, old_v = self.cache.popitem(last=False) + self.put_raw(old_k, old_v) + + return p + + def get_keys(self): + return self.db.keys() + + def put(self, key, val): + self.cache[key] = val + self.cache.move_to_end(key) + if len(self.cache) > self.cachesize: + old_k, old_v = self.cache.popitem(last=False) + self.put_raw(old_k, old_v) + + def put_raw(self, key, val, sync=False): + key = autoBytes(key) + val = autoBytes(val) + if type(val) is not bytes: + val = val.pack() + self.db.put(key, val) + if sync: + self.db.sync() + + def sync(self): + for k, v in self.cache.items(): + self.put_raw(k, v) + + self.db.sync() + + def close(self): + self.sync() + self.db.close() + + def __len__(self): + return self.db.stat()["nkeys"] + class DB: - def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=False): + def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache=None): if os.path.isdir(dir): self.dir = dir else: raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), dir) ro = readonly - cachesize = None if update_cache: - cachesize = CACHESIZE + db_cls = lambda dir, ro, ctype: CachedBsdDB(dir, ro, ctype, cachesize=update_cache) + else: + db_cls = lambda dir, ro, ctype: BsdDB(dir, ro, ctype, shared=shared) self.vars = BsdDB(dir + '/variables.db', ro, lambda x: int(x.decode()), shared=shared) # Key-value store of basic information @@ -230,7 +307,7 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache self.file = BsdDB(dir + '/filenames.db', ro, lambda x: x.decode(), shared=shared) # Map serial number to filename self.vers = BsdDB(dir + '/versions.db', ro, PathList, shared=shared) - self.defs = BsdDB(dir + '/definitions.db', ro, DefList, shared=shared, cachesize=cachesize) + self.defs = db_cls(dir + '/definitions.db', ro, DefList) self.defs_cache = {} NOOP = lambda x: x self.defs_cache['C'] = BsdDB(dir + '/definitions-cache-C.db', ro, NOOP, shared=shared) @@ -238,12 +315,12 @@ def __init__(self, dir, readonly=True, dtscomp=False, shared=False, update_cache self.defs_cache['D'] = BsdDB(dir + '/definitions-cache-D.db', ro, NOOP, shared=shared) self.defs_cache['M'] = BsdDB(dir + '/definitions-cache-M.db', ro, NOOP, shared=shared) assert sorted(self.defs_cache.keys()) == sorted(lib.CACHED_DEFINITIONS_FAMILIES) - self.refs = BsdDB(dir + '/references.db', ro, RefList, shared=shared, cachesize=cachesize) - self.docs = BsdDB(dir + '/doccomments.db', ro, RefList, shared=shared, cachesize=cachesize) + self.refs = db_cls(dir + '/references.db', ro, RefList) + self.docs = 
db_cls(dir + '/doccomments.db', ro, RefList) self.dtscomp = dtscomp if dtscomp: - self.comps = BsdDB(dir + '/compatibledts.db', ro, RefList, shared=shared, cachesize=cachesize) - self.comps_docs = BsdDB(dir + '/compatibledts_docs.db', ro, RefList, shared=shared, cachesize=cachesize) + self.comps = db_cls(dir + '/compatibledts.db', ro, RefList) + self.comps_docs = db_cls(dir + '/compatibledts_docs.db', ro, RefList) # Use a RefList in case there are multiple doc comments for an identifier def close(self): diff --git a/elixir/update.py b/elixir/update.py index eb19fa55..f2ecd915 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,6 +1,6 @@ import os.path import logging -from multiprocessing import cpu_count +from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple, Set @@ -36,17 +36,13 @@ IdxCache = Dict[int, Tuple[bytes, str, bool]] # Check if definition for ident is visible in current version -def def_in_version(db: DB, def_cache: Set[bytes], idx_to_hash_and_filename: IdxCache, ident: bytes) -> bool: - if ident in def_cache: - return True - +def def_in_version(db: DB, idx_to_hash_and_filename: IdxCache, ident: bytes) -> bool: defs_this_ident = db.defs.get(ident) if not defs_this_ident: return False for def_idx, _, _, _ in defs_this_ident.iter(): if def_idx in idx_to_hash_and_filename: - def_cache.add(ident) return True return False @@ -64,12 +60,14 @@ def add_defs(db: DB, defs: DefsDict): db.defs.put(ident, obj) # Add references to database -def add_refs(db: DB, def_cache: Set[bytes], idx_to_hash_and_filename: IdxCache, refs: RefsDict): +def add_refs(db: DB, idx_to_hash_and_filename: IdxCache, refs: RefsDict): for ident, idx_to_lines in refs.items(): deflist = db.defs.get(ident) - in_version = def_in_version(db, def_cache, idx_to_hash_and_filename, ident) + if deflist is None: + continue - if deflist is None or not in_version: + in_version = def_in_version(db, idx_to_hash_and_filename, ident) + if not in_version: continue def deflist_exists(idx: int, line: int): @@ -320,10 +318,9 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") - def_cache = set() for result in pool.imap_unordered(get_refs, idxes, chunksize): if result is not None: - add_refs(db, def_cache, idx_to_hash_and_filename, result) + add_refs(db, idx_to_hash_and_filename, result) logger.info("refs done") @@ -334,10 +331,11 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): dts_comp_support = bool(int(script('dts-comp'))) db = None + set_start_method('spawn') with Pool() as pool: for tag in scriptLines('list-tags'): if db is None: - db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=True) + db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=50000) if not db.vers.exists(tag): logger.info("updating tag %s", tag) From 8b339a4b04cb83d74fc8f9338953fceaff7b2a9e Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 15:49:16 +0200 Subject: [PATCH 09/29] do not close db between tags --- elixir/update.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index f2ecd915..0c69a3ee 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -329,17 +329,14 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): if __name__ == "__main__": dts_comp_support = 
bool(int(script('dts-comp'))) - db = None + db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=100000) set_start_method('spawn') with Pool() as pool: for tag in scriptLines('list-tags'): - if db is None: - db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=50000) - if not db.vers.exists(tag): logger.info("updating tag %s", tag) update_version(db, tag, pool, dts_comp_support) - db.close() - db = None + + db.close() From 6376e496573fa65bea99ac4307ea13ecfd070461 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 16:54:24 +0200 Subject: [PATCH 10/29] attempt to improve reflist and deflist performance --- elixir/data.py | 80 ++++++++++++++++++++++++++++++------------------ elixir/update.py | 10 ++++-- 2 files changed, 59 insertions(+), 31 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 7bc218fc..0cfe728f 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -64,24 +64,33 @@ class DefList: a line number and a file family. Also stores in which families the ident exists for faster tests.''' def __init__(self, data=b'#'): - self.data, self.families = data.split(b'#') + data, self.families = data.split(b'#') + self.entries = [self.decode_entry(d) for d in deflist_regex.findall(data)] + self.sorted = False + + def decode_entry(self, entry): + id = int(entry[0]) + type = defTypeR [entry[1].decode()] + line = int(entry[2]) + family = entry[3].decode() + return id, type, line, family + + def encode_entry(self, entry): + return str(entry[0]) + defTypeD[entry[1]] + str(entry[2]) + entry[3] def iter(self, dummy=False): # Get all element in a list of sublists and sort them - entries = deflist_regex.findall(self.data) - entries.sort(key=lambda x:int(x[0])) - for id, type, line, family in entries: - id = int(id) - type = defTypeR [type.decode()] - line = int(line) - family = family.decode() + if not self.sorted: + self.entries.sort(key=lambda x:int(x[0])) + self.sorted = True + + for id, type, line, family in self.entries: yield id, type, line, family if dummy: yield maxId, None, None, None def exists(self, idx, line_num): - entries = deflist_regex.findall(self.data) - for id, _, line, _ in entries: + for id, _, line, _ in self.entries: if id == idx and int(line) == line_num: return True @@ -90,14 +99,18 @@ def exists(self, idx, line_num): def append(self, id, type, line, family): if type not in defTypeD: return - p = str(id) + defTypeD[type] + str(line) + family - if self.data != b'': - p = ',' + p - self.data += p.encode() + + self.sorted = False + self.entries.append((id, type, line, family)) self.add_family(family) def pack(self): - return self.data + b'#' + self.families + if not self.sorted: + self.entries.sort(key=lambda x:int(x[0])) + self.sorted = True + + data = ",".join(self.encode_entry(entry) for entry in self.entries) + return data.encode() + b'#' + self.families def add_family(self, family): family = family.encode() @@ -110,7 +123,7 @@ def get_families(self): return self.families.decode().split(',') def get_macros(self): - return deflist_macro_regex.findall(self.data.decode()) or '' + return [entry[3] for entry in self.entries if entry[1] == 'macro'] class PathList: '''Stores associations between a blob ID and a file path. 
@@ -139,25 +152,36 @@ class RefList: and the corresponding family.''' def __init__(self, data=b''): self.data = data + self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]] + self.sorted = False + + def decode_entry(self, k): + return (int(k[0].decode()), k[1].decode(), k[2].decode()) def iter(self, dummy=False): # Split all elements in a list of sublists and sort them - entries = [x.split(b':') for x in self.data.split(b'\n')[:-1]] - entries.sort(key=lambda x:int(x[0])) - for b, c, d in entries: - b = int(b.decode()) - c = c.decode() - d = d.decode() + if not self.sorted: + self.sorted = True + self.entries.sort(key=lambda x:int(x[0])) + + for b, c, d in self.entries: yield b, c, d if dummy: yield maxId, None, None def append(self, id, lines, family): - p = str(id) + ':' + lines + ':' + family + '\n' - self.data += p.encode() + self.sorted = False + self.entries.append((id, lines, family)) def pack(self): - return self.data + if not self.sorted: + self.sorted = True + self.entries.sort(key=lambda x:int(x[0])) + + result = "" + for id, lines, family in self.entries: + result += str(id) + ":" + lines + ":" + family + "\n" + return result.encode() class BsdDB: def __init__(self, filename, readonly, contentType, shared=False, cachesize=None): @@ -230,16 +254,14 @@ def exists(self, key): if key in self.cache: return True - key = autoBytes(key) - return self.db.exists(key) + return self.db.exists(autoBytes(key)) def get(self, key): if key in self.cache: self.cache.move_to_end(key) return self.cache[key] - key = autoBytes(key) - p = self.db.get(key) + p = self.db.get(autoBytes(key)) if p is None: return None p = self.ctype(p) diff --git a/elixir/update.py b/elixir/update.py index 0c69a3ee..ab2e1a31 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -306,7 +306,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("docs done") if dts_comp_support: - for result in pool.imap_unordered(get_comps, idxes, chunksize): + comp_idxes = [idx for idx in idxes if getFileFamily(idx[2]) not in (None, 'K', 'M')] + comp_chunksize = int(len(comp_idxes) / cpu_count()) + comp_chunksize = min(max(1, comp_chunksize), 100) + for result in pool.imap_unordered(get_comps, comp_idxes, comp_chunksize): if result is not None: add_comps(db, *result) @@ -318,7 +321,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") - for result in pool.imap_unordered(get_refs, idxes, chunksize): + ref_idxes = [idx for idx in idxes if getFileFamily(idx[2]) is not None] + ref_chunksize = int(len(ref_idxes) / cpu_count()) + ref_chunksize = min(max(1, ref_chunksize), 100) + for result in pool.imap_unordered(get_refs, ref_idxes, ref_chunksize): if result is not None: add_refs(db, idx_to_hash_and_filename, result) From 19e81c643a800cf1ace9aafad2442ce1c0dedc34 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 17:29:43 +0200 Subject: [PATCH 11/29] print slow docs/comps docs --- elixir/update.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/elixir/update.py b/elixir/update.py index ab2e1a31..01ea1894 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,5 +1,6 @@ import os.path import logging +import time from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple, Set @@ -247,7 +248,13 @@ def get_docs(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: family 
= getFileFamily(filename) if family in (None, 'M'): return + start = time.time() lines = (line.decode() for line in scriptLines('parse-docs', hash, filename)) + parser_time = time.time()-start + + if parser_time > 10: + print("docs timeout", parser_time, file_id) + docs = collect_get_blob_output(lines) return (idx, family, docs) @@ -259,7 +266,14 @@ def get_comps(file_id: FileId) -> Optional[Tuple[int, str, LinesListDict]]: if family in (None, 'K', 'M'): return compatibles_parser = FindCompatibleDTS() + + start = time.time() lines = compatibles_parser.run(scriptLines('get-blob', hash), family) + parser_time = time.time()-start + + if parser_time > 10: + print("comps docs timeout", parser_time, file_id) + comps = collect_get_blob_output(lines) return (idx, family, comps) @@ -340,6 +354,9 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): set_start_method('spawn') with Pool() as pool: for tag in scriptLines('list-tags'): + if not tag.startswith(b'v6.1') or b'rc' in tag: + continue + if not db.vers.exists(tag): logger.info("updating tag %s", tag) update_version(db, tag, pool, dts_comp_support) From a284c92b967f89f21ed64123a49660cc5cb0d564 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 18:20:41 +0200 Subject: [PATCH 12/29] generate defs caches at the end of update --- elixir/update.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index 01ea1894..bba102b3 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -343,8 +343,6 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): add_refs(db, idx_to_hash_and_filename, result) logger.info("refs done") - - generate_defs_caches(db) logger.info("update done") if __name__ == "__main__": @@ -361,5 +359,6 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("updating tag %s", tag) update_version(db, tag, pool, dts_comp_support) + generate_defs_caches(db) db.close() From 58fca892285630ea8c1cbc7c47c2b2848a6a7f6f Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 19:10:45 +0200 Subject: [PATCH 13/29] check def existence in get_refs --- elixir/data.py | 2 +- elixir/update.py | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 0cfe728f..c8d6ed8b 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -298,7 +298,7 @@ def sync(self): self.put_raw(k, v) self.db.sync() - + def close(self): self.sync() self.db.close() diff --git a/elixir/update.py b/elixir/update.py index bba102b3..8d0a22a6 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -3,7 +3,7 @@ import time from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool -from typing import Dict, Iterable, List, Optional, Tuple, Set +from typing import Dict, Iterable, List, Optional, Tuple from find_compatible_dts import FindCompatibleDTS @@ -194,8 +194,11 @@ def get_defs(file_id: FileId) -> Optional[DefsDict]: return defs +def call_get_refs(arg: Tuple[FileId, str]) -> Optional[RefsDict]: + return get_refs(arg[0], BsdDB(arg[1], True, lambda x: x)) + # Collect references from the tokenizer for a file -def get_refs(file_id: FileId) -> Optional[RefsDict]: +def get_refs(file_id: FileId, defs: BsdDB) -> Optional[RefsDict]: idx, hash, filename = file_id refs = {} family = getFileFamily(filename) @@ -216,6 +219,9 @@ def get_refs(file_id: FileId) -> Optional[RefsDict]: # We only index CONFIG_??? 
in makefiles if (family != 'M' or tok.startswith(b'CONFIG_')): + if not defs.exists(tok): + continue + if tok not in refs: refs[tok] = {} @@ -335,10 +341,13 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") - ref_idxes = [idx for idx in idxes if getFileFamily(idx[2]) is not None] + + db.defs.sync() + + ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) - for result in pool.imap_unordered(get_refs, ref_idxes, ref_chunksize): + for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): if result is not None: add_refs(db, idx_to_hash_and_filename, result) @@ -352,8 +361,8 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): set_start_method('spawn') with Pool() as pool: for tag in scriptLines('list-tags'): - if not tag.startswith(b'v6.1') or b'rc' in tag: - continue + #if not tag.startswith(b'v6.1') or b'rc' in tag: + # continue if not db.vers.exists(tag): logger.info("updating tag %s", tag) From 00ba0047f66d6cb7883c9a0369fddcd15a1d76a7 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 19:33:44 +0200 Subject: [PATCH 14/29] optiimze find_compatible_dts for gigantic amd driver files --- find_compatible_dts.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/find_compatible_dts.py b/find_compatible_dts.py index 8aec94d6..a1a356f1 100755 --- a/find_compatible_dts.py +++ b/find_compatible_dts.py @@ -31,6 +31,8 @@ def __init__(self): self.regex_bindings = re.compile("([\w-]+,?[\w-]+)") def parse_c(self, content): + if "compatible" not in content: + return [] return self.regex_c.findall(content) def parse_dts(self, content): From d4b79b0a35497127e01e5cdae6b79731bae22de1 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 19:48:34 +0200 Subject: [PATCH 15/29] query deflist once, cprofile add refs --- elixir/data.py | 21 ++++++++++++++------- elixir/update.py | 23 +++++++++++------------ 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index c8d6ed8b..89aee36e 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -187,19 +187,26 @@ class BsdDB: def __init__(self, filename, readonly, contentType, shared=False, cachesize=None): self.filename = filename self.db = berkeleydb.db.DB() - flags = berkeleydb.db.DB_THREAD if shared else 0 + self.flags = berkeleydb.db.DB_THREAD if shared else 0 + + self.readonly = readonly + if self.readonly: + self.flags |= berkeleydb.db.DB_RDONLY + else: + self.flags |= berkeleydb.db.DB_CREATE if cachesize is not None: self.db.set_cachesize(cachesize[0], cachesize[1]) - if readonly: - flags |= berkeleydb.db.DB_RDONLY - self.db.open(filename, flags=flags) - else: - flags |= berkeleydb.db.DB_CREATE - self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + self.open() self.ctype = contentType + def open(self): + if self.readonly: + self.db.open(self.filename, flags=self.flags) + else: + self.db.open(self.filename, flags=self.flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) + def exists(self, key): key = autoBytes(key) return self.db.exists(key) diff --git a/elixir/update.py b/elixir/update.py index 8d0a22a6..6807a47e 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,6 +1,7 @@ import os.path import logging import time +import cProfile from multiprocessing import cpu_count, set_start_method from multiprocessing.pool 
import Pool from typing import Dict, Iterable, List, Optional, Tuple @@ -40,13 +41,13 @@ def def_in_version(db: DB, idx_to_hash_and_filename: IdxCache, ident: bytes) -> bool: defs_this_ident = db.defs.get(ident) if not defs_this_ident: - return False + return None for def_idx, _, _, _ in defs_this_ident.iter(): if def_idx in idx_to_hash_and_filename: - return True + return defs_this_ident - return False + return None # Add definitions to database def add_defs(db: DB, defs: DefsDict): @@ -63,12 +64,8 @@ def add_defs(db: DB, defs: DefsDict): # Add references to database def add_refs(db: DB, idx_to_hash_and_filename: IdxCache, refs: RefsDict): for ident, idx_to_lines in refs.items(): - deflist = db.defs.get(ident) - if deflist is None: - continue - - in_version = def_in_version(db, idx_to_hash_and_filename, ident) - if not in_version: + deflist = def_in_version(db, idx_to_hash_and_filename, ident) + if not deflist: continue def deflist_exists(idx: int, line: int): @@ -347,9 +344,11 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) - for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): - if result is not None: - add_refs(db, idx_to_hash_and_filename, result) + with cProfile.Profile() as pr: + for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): + if result is not None: + add_refs(db, idx_to_hash_and_filename, result) + pr.dump_stats("refs"+str(int(time.time()))) logger.info("refs done") logger.info("update done") From 82fa4429198ee034fe109c707cd13106aa28e019 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sat, 5 Jul 2025 22:20:25 +0200 Subject: [PATCH 16/29] deflist to dict --- elixir/data.py | 35 +++++++++++++++++++---------------- elixir/update.py | 18 ++++++++++-------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 89aee36e..ec64a976 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -65,8 +65,15 @@ class DefList: Also stores in which families the ident exists for faster tests.''' def __init__(self, data=b'#'): data, self.families = data.split(b'#') - self.entries = [self.decode_entry(d) for d in deflist_regex.findall(data)] - self.sorted = False + + self.entries = OrderedDict() + tmp_entries = [self.decode_entry(d) for d in deflist_regex.findall(data)] + tmp_entries.sort(key=lambda x:int(x[0])) + for id, type, line, family in tmp_entries: + if id not in self.entries: + self.entries[id] = [(type, line, family)] + else: + self.entries[id].append((type, line, family)) def decode_entry(self, entry): id = int(entry[0]) @@ -80,12 +87,9 @@ def encode_entry(self, entry): def iter(self, dummy=False): # Get all element in a list of sublists and sort them - if not self.sorted: - self.entries.sort(key=lambda x:int(x[0])) - self.sorted = True - - for id, type, line, family in self.entries: - yield id, type, line, family + for id, val in self.entries.items(): + for type, line, family in val: + yield id, type, line, family if dummy: yield maxId, None, None, None @@ -100,16 +104,15 @@ def append(self, id, type, line, family): if type not in defTypeD: return - self.sorted = False - self.entries.append((id, type, line, family)) + if id not in self.entries: + self.entries[id] = [(type, line, family)] + else: + self.entries[id].append((type, line, family)) + self.add_family(family) 
def pack(self): - if not self.sorted: - self.entries.sort(key=lambda x:int(x[0])) - self.sorted = True - - data = ",".join(self.encode_entry(entry) for entry in self.entries) + data = ",".join(self.encode_entry((id, *entry)) for id, vals in self.entries.items() for entry in vals) return data.encode() + b'#' + self.families def add_family(self, family): @@ -123,7 +126,7 @@ def get_families(self): return self.families.decode().split(',') def get_macros(self): - return [entry[3] for entry in self.entries if entry[1] == 'macro'] + return [entry[3] for val in self.entries.values() for entry in val if entry[1] == 'macro'] class PathList: '''Stores associations between a blob ID and a file path. diff --git a/elixir/update.py b/elixir/update.py index 6807a47e..3ead2a63 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -43,7 +43,7 @@ def def_in_version(db: DB, idx_to_hash_and_filename: IdxCache, ident: bytes) -> if not defs_this_ident: return None - for def_idx, _, _, _ in defs_this_ident.iter(): + for def_idx in defs_this_ident.entries.keys(): if def_idx in idx_to_hash_and_filename: return defs_this_ident @@ -69,9 +69,13 @@ def add_refs(db: DB, idx_to_hash_and_filename: IdxCache, refs: RefsDict): continue def deflist_exists(idx: int, line: int): - for def_idx, _, def_line, _ in deflist.iter(): - if def_idx == idx and def_line == line: + if idx not in deflist.entries: + return False + + for _, def_line, _ in deflist.entries[idx]: + if def_line == line: return True + return False obj = db.refs.get(ident) @@ -344,11 +348,9 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) - with cProfile.Profile() as pr: - for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): - if result is not None: - add_refs(db, idx_to_hash_and_filename, result) - pr.dump_stats("refs"+str(int(time.time()))) + for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): + if result is not None: + add_refs(db, idx_to_hash_and_filename, result) logger.info("refs done") logger.info("update done") From c4103a2bb6ad8d29e8d70c8d5ac36981c41d0805 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 01:51:36 +0200 Subject: [PATCH 17/29] fix database segfaults --- elixir/data.py | 36 +++++++++++++++++++++++++++--------- elixir/update.py | 9 +++++++-- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index ec64a976..5b6255f3 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -246,19 +246,28 @@ def __len__(self): class CachedBsdDB: def __init__(self, filename, readonly, contentType, cachesize): self.filename = filename - self.db = berkeleydb.db.DB() - flags = 0 + self.db = None + self.readonly = readonly self.cachesize = cachesize self.cache = OrderedDict() - if readonly: + self.open() + + self.ctype = contentType + + def open(self): + if self.db is None: + self.db = berkeleydb.db.DB() + + flags = 0 + + if self.readonly: flags |= berkeleydb.db.DB_RDONLY - self.db.open(filename, flags=flags) + self.db.open(self.filename, flags=flags) else: flags |= berkeleydb.db.DB_CREATE - self.db.open(filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) - self.ctype = contentType + self.db.open(self.filename, flags=flags, mode=0o644, dbtype=berkeleydb.db.DB_BTREE) def exists(self, key): if key in self.cache: @@ -280,7 
+289,8 @@ def get(self, key): self.cache.move_to_end(key) if len(self.cache) > self.cachesize: old_k, old_v = self.cache.popitem(last=False) - self.put_raw(old_k, old_v) + if not self.readonly: + self.put_raw(old_k, old_v) return p @@ -288,6 +298,9 @@ def get_keys(self): return self.db.keys() def put(self, key, val): + if self.readonly: + raise Exception("database is readonly") + self.cache[key] = val self.cache.move_to_end(key) if len(self.cache) > self.cachesize: @@ -295,6 +308,9 @@ def put(self, key, val): self.put_raw(old_k, old_v) def put_raw(self, key, val, sync=False): + if self.readonly: + raise Exception("database is readonly") + key = autoBytes(key) val = autoBytes(val) if type(val) is not bytes: @@ -304,14 +320,16 @@ def put_raw(self, key, val, sync=False): self.db.sync() def sync(self): - for k, v in self.cache.items(): - self.put_raw(k, v) + if not self.readonly: + for k, v in self.cache.items(): + self.put_raw(k, v) self.db.sync() def close(self): self.sync() self.db.close() + self.db = None def __len__(self): return self.db.stat()["nkeys"] diff --git a/elixir/update.py b/elixir/update.py index 3ead2a63..f6626f33 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,7 +1,6 @@ import os.path import logging import time -import cProfile from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple @@ -343,7 +342,9 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") - db.defs.sync() + db.defs.close() + db.defs.readonly = True + db.defs.open() ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) @@ -352,6 +353,10 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): if result is not None: add_refs(db, idx_to_hash_and_filename, result) + db.defs.close() + db.defs.readonly = False + db.defs.open() + logger.info("refs done") logger.info("update done") From b7c0fc047d90b082c3110af98a3c413ed92ffb33 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 02:15:10 +0200 Subject: [PATCH 18/29] catch sigint --- elixir/update.py | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/elixir/update.py b/elixir/update.py index f6626f33..2b60ca42 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -1,6 +1,7 @@ import os.path import logging import time +import signal from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple @@ -360,20 +361,42 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("refs done") logger.info("update done") + +sigint_caught = False + +def sigint_handler(signum, _frame): + global sigint_caught + if not sigint_caught: + logger.info("Caught SIGINT... 
the script will exit after processing this version") + signal.signal(signum, signal.SIG_IGN) + sigint_caught = True + +signal.signal(signal.SIGINT, sigint_handler) + +def ignore_sigint(): + signal.signal(signal.SIGINT, lambda _,__: None) + if __name__ == "__main__": + dts_comp_support = bool(int(script('dts-comp'))) db = DB(getDataDir(), readonly=False, dtscomp=dts_comp_support, shared=False, update_cache=100000) set_start_method('spawn') - with Pool() as pool: + with Pool(initializer=ignore_sigint) as pool: for tag in scriptLines('list-tags'): #if not tag.startswith(b'v6.1') or b'rc' in tag: # continue + if sigint_caught: + break + if not db.vers.exists(tag): logger.info("updating tag %s", tag) update_version(db, tag, pool, dts_comp_support) + logger.info("generating def caches") generate_defs_caches(db) + logger.info("def caches generated") db.close() + From 3531c7c26b152bf2c617ff518c7e2009ada2ada1 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 02:48:23 +0200 Subject: [PATCH 19/29] add def in ver cache --- elixir/update.py | 50 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index 2b60ca42..74887bba 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -5,6 +5,7 @@ from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple +from collections import OrderedDict from find_compatible_dts import FindCompatibleDTS @@ -37,17 +38,30 @@ # File idx -> (hash, filename, is a new file?) IdxCache = Dict[int, Tuple[bytes, str, bool]] -# Check if definition for ident is visible in current version -def def_in_version(db: DB, idx_to_hash_and_filename: IdxCache, ident: bytes) -> bool: - defs_this_ident = db.defs.get(ident) - if not defs_this_ident: - return None +class Cache: + def __init__(self, size): + self.cache = OrderedDict() + self.size = size - for def_idx in defs_this_ident.entries.keys(): - if def_idx in idx_to_hash_and_filename: - return defs_this_ident + def contains(self, key): + return key in self.cache - return None + def get(self, key): + self.cache.move_to_end(key) + return self.cache[key] + + def put(self, key, val): + self.cache[key] = val + self.cache.move_to_end(key) + if len(self.cache) > self.size: + self.cache.popitem(last=False) + +# Check if definition for ident is visible in current version +def def_in_version(def_ident: DefList, idx_to_hash_and_filename: IdxCache) -> bool: + for def_idx in def_ident.entries.keys(): + if def_idx in idx_to_hash_and_filename: + return True + return False # Add definitions to database def add_defs(db: DB, defs: DefsDict): @@ -62,10 +76,19 @@ def add_defs(db: DB, defs: DefsDict): db.defs.put(ident, obj) # Add references to database -def add_refs(db: DB, idx_to_hash_and_filename: IdxCache, refs: RefsDict): +def add_refs(db: DB, in_ver_cache: Cache, idx_to_hash_and_filename: IdxCache, refs: RefsDict): for ident, idx_to_lines in refs.items(): - deflist = def_in_version(db, idx_to_hash_and_filename, ident) - if not deflist: + deflist = db.defs.get(ident) + if deflist is None: + continue + + if not in_ver_cache.contains(ident): + in_version = def_in_version(deflist, idx_to_hash_and_filename) + if not in_version: + in_ver_cache.put(ident, False) + continue + in_ver_cache.put(ident, True) + elif not in_ver_cache.get(ident): continue def deflist_exists(idx: int, line: int): @@ -347,12 +370,13 @@ def update_version(db: DB, tag: bytes, pool: 
Pool, dts_comp_support: bool): db.defs.readonly = True db.defs.open() + in_def_cache = Cache(10000) ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): if result is not None: - add_refs(db, idx_to_hash_and_filename, result) + add_refs(db, in_def_cache, idx_to_hash_and_filename, result) db.defs.close() db.defs.readonly = False From 8c712e44192c115c64f2cfd96318e76457ace699 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 13:35:16 +0200 Subject: [PATCH 20/29] move more work to get_refs --- elixir/update.py | 52 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/elixir/update.py b/elixir/update.py index 74887bba..e36ea3b6 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -9,7 +9,7 @@ from find_compatible_dts import FindCompatibleDTS -from elixir.data import DB, BsdDB, DefList, PathList, RefList +from elixir.data import DB, BsdDB, CachedBsdDB, DefList, PathList, RefList from elixir.lib import ( compatibleFamily, compatibleMacro, @@ -91,31 +91,14 @@ def add_refs(db: DB, in_ver_cache: Cache, idx_to_hash_and_filename: IdxCache, re elif not in_ver_cache.get(ident): continue - def deflist_exists(idx: int, line: int): - if idx not in deflist.entries: - return False - - for _, def_line, _ in deflist.entries[idx]: - if def_line == line: - return True - - return False - obj = db.refs.get(ident) if obj is None: obj = RefList() - modified = False - for (idx, family), lines in idx_to_lines.items(): - lines = [n for n in lines if not deflist_exists(idx, n)] + for (idx, family), lines_str in idx_to_lines.items(): + obj.append(idx, lines_str, family) - if len(lines) != 0: - lines_str = ','.join((str(n) for n in lines)) - obj.append(idx, lines_str, family) - modified = True - - if modified: - db.refs.put(ident, obj) + db.refs.put(ident, obj) # Add documentation references to database def add_docs(db: DB, idx: int, family: str, docs: Dict[str, List[int]]): @@ -219,10 +202,10 @@ def get_defs(file_id: FileId) -> Optional[DefsDict]: return defs def call_get_refs(arg: Tuple[FileId, str]) -> Optional[RefsDict]: - return get_refs(arg[0], BsdDB(arg[1], True, lambda x: x)) + return get_refs(arg[0], CachedBsdDB(arg[1], True, DefList, 1000)) # Collect references from the tokenizer for a file -def get_refs(file_id: FileId, defs: BsdDB) -> Optional[RefsDict]: +def get_refs(file_id: FileId, defs: CachedBsdDB) -> Optional[RefsDict]: idx, hash, filename = file_id refs = {} family = getFileFamily(filename) @@ -236,6 +219,16 @@ def get_refs(file_id: FileId, defs: BsdDB) -> Optional[RefsDict]: even = True line_num = 1 + def deflist_exists(deflist, idx: int, line: int): + if idx not in deflist.entries: + return False + + for _, def_line, _ in deflist.entries[idx]: + if def_line == line: + return True + + return False + for tok in tokens: even = not even if even: @@ -243,20 +236,25 @@ def get_refs(file_id: FileId, defs: BsdDB) -> Optional[RefsDict]: # We only index CONFIG_??? 
in makefiles if (family != 'M' or tok.startswith(b'CONFIG_')): - if not defs.exists(tok): + deflist = defs.get(tok) + if not deflist: + continue + + if deflist_exists(deflist, idx, line_num): continue if tok not in refs: refs[tok] = {} if (idx, family) not in refs[tok]: - refs[tok][(idx, family)] = [] - - refs[tok][(idx, family)].append(line_num) + refs[tok][(idx, family)] = str(line_num) + else: + refs[tok][(idx, family)] += "," + str(line_num) else: line_num += tok.count(b'\1') + return refs # Collect compatible script output into lineslinst-schema compatible format From 1fa6949d2e3a0690fa0e19cd88a4b16de0bccfe2 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 14:13:25 +0200 Subject: [PATCH 21/29] optimize write-only reflist case --- elixir/data.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 5b6255f3..92d9c64c 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -155,17 +155,19 @@ class RefList: and the corresponding family.''' def __init__(self, data=b''): self.data = data - self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]] + self.entries = None self.sorted = False def decode_entry(self, k): return (int(k[0].decode()), k[1].decode(), k[2].decode()) + def populate_entries(self): + self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]] + self.entries.sort(key=lambda x:int(x[0])) + def iter(self, dummy=False): - # Split all elements in a list of sublists and sort them - if not self.sorted: - self.sorted = True - self.entries.sort(key=lambda x:int(x[0])) + if self.entries is None: + self.populate_entries() for b, c, d in self.entries: yield b, c, d @@ -173,18 +175,19 @@ def iter(self, dummy=False): yield maxId, None, None def append(self, id, lines, family): - self.sorted = False - self.entries.append((id, lines, family)) + if self.entries is not None: + self.entries.append((id, lines, family)) + else: + self.data += (str(id) + ":" + lines + ":" + family + "\n").encode() def pack(self): - if not self.sorted: - self.sorted = True - self.entries.sort(key=lambda x:int(x[0])) - - result = "" - for id, lines, family in self.entries: - result += str(id) + ":" + lines + ":" + family + "\n" - return result.encode() + if self.entries is not None: + result = "" + for id, lines, family in self.entries: + result += str(id) + ":" + lines + ":" + family + "\n" + return result.encode() + else: + return self.data class BsdDB: def __init__(self, filename, readonly, contentType, shared=False, cachesize=None): From 8f65990a89e99d03814eeeb6b619290ced910def Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 14:21:05 +0200 Subject: [PATCH 22/29] refs append only opt pt 2 --- elixir/data.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/elixir/data.py b/elixir/data.py index 92d9c64c..eda0278b 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -156,6 +156,7 @@ class RefList: def __init__(self, data=b''): self.data = data self.entries = None + self.to_append = [] self.sorted = False def decode_entry(self, k): @@ -163,6 +164,8 @@ def decode_entry(self, k): def populate_entries(self): self.entries = [self.decode_entry(x.split(b':')) for x in self.data.split(b'\n')[:-1]] + self.entries += self.to_append + self.to_append = [] self.entries.sort(key=lambda x:int(x[0])) def iter(self, dummy=False): @@ -178,7 +181,7 @@ def append(self, id, lines, family): if self.entries is not None: 
self.entries.append((id, lines, family)) else: - self.data += (str(id) + ":" + lines + ":" + family + "\n").encode() + self.to_append.append((id, lines, family)) def pack(self): if self.entries is not None: @@ -187,6 +190,11 @@ def pack(self): result += str(id) + ":" + lines + ":" + family + "\n" return result.encode() else: + result = "" + for id, lines, family in self.to_append: + result += str(id) + ":" + lines + ":" + family + "\n" + self.data += result.encode() + self.to_append = [] return self.data class BsdDB: From f90a8afb4e27f254eacebfa5bd9ef7f2c5df9bcc Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 14:37:34 +0200 Subject: [PATCH 23/29] add number of blobs in logs --- elixir/update.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/elixir/update.py b/elixir/update.py index e36ea3b6..2a6a1a3d 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -333,7 +333,7 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): chunksize = int(len(idxes) / cpu_count()) chunksize = min(max(1, chunksize), 100) - logger.info("collecting blobs done") + logger.info("collecting blobs done, new blobs: %d", len(idxes)) for result in pool.imap_unordered(get_defs, idxes, chunksize): if result is not None: @@ -372,6 +372,8 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) + logger.info("ref blobs: %d", len(ref_idxes)) + for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): if result is not None: add_refs(db, in_def_cache, idx_to_hash_and_filename, result) @@ -420,5 +422,6 @@ def ignore_sigint(): generate_defs_caches(db) logger.info("def caches generated") db.close() + logger.info("database closed") From 33d4a0c9a9509c71b6c2f91810a86d7c084032a6 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 15:01:39 +0200 Subject: [PATCH 24/29] dont filter ref_idxes by file family --- elixir/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/elixir/update.py b/elixir/update.py index 2a6a1a3d..0b7b601a 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -369,7 +369,7 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): db.defs.open() in_def_cache = Cache(10000) - ref_idxes = [(idx, db.defs.filename) for idx in idxes if getFileFamily(idx[2]) is not None] + ref_idxes = [(idx, db.defs.filename) for idx in idxes] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) logger.info("ref blobs: %d", len(ref_idxes)) From d65d4e00dbe4934cf2dad12902cc0202ff77299f Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 15:22:43 +0200 Subject: [PATCH 25/29] put in cachedDB only if modified --- elixir/data.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/elixir/data.py b/elixir/data.py index eda0278b..3458d805 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -66,6 +66,7 @@ class DefList: def __init__(self, data=b'#'): data, self.families = data.split(b'#') + self.modified = False self.entries = OrderedDict() tmp_entries = [self.decode_entry(d) for d in deflist_regex.findall(data)] tmp_entries.sort(key=lambda x:int(x[0])) @@ -104,6 +105,7 @@ def append(self, id, type, line, family): if type not in defTypeD: return + self.modified = True if id not in self.entries: 
self.entries[id] = [(type, line, family)] else: @@ -158,6 +160,7 @@ def __init__(self, data=b''): self.entries = None self.to_append = [] self.sorted = False + self.modified = False def decode_entry(self, k): return (int(k[0].decode()), k[1].decode(), k[2].decode()) @@ -178,6 +181,7 @@ def iter(self, dummy=False): yield maxId, None, None def append(self, id, lines, family): + self.modified = True if self.entries is not None: self.entries.append((id, lines, family)) else: @@ -333,7 +337,8 @@ def put_raw(self, key, val, sync=False): def sync(self): if not self.readonly: for k, v in self.cache.items(): - self.put_raw(k, v) + if v.modified: + self.put_raw(k, v) self.db.sync() From a2dbb8fbd6ee9138588c70c45364679f641c89d3 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 19:11:26 +0200 Subject: [PATCH 26/29] def sync improvement --- elixir/data.py | 34 ++++++++++++++++++++-------------- elixir/update.py | 6 +++++- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 3458d805..3eed7c4e 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -21,6 +21,7 @@ from typing import OrderedDict import berkeleydb import re +import time from . import lib from .lib import autoBytes import os @@ -64,7 +65,8 @@ class DefList: a line number and a file family. Also stores in which families the ident exists for faster tests.''' def __init__(self, data=b'#'): - data, self.families = data.split(b'#') + data, tmp_families = data.split(b'#') + self.families = tmp_families.decode() self.modified = False self.entries = OrderedDict() @@ -83,7 +85,7 @@ def decode_entry(self, entry): family = entry[3].decode() return id, type, line, family - def encode_entry(self, entry): + def encode_entry(self, entry) -> str: return str(entry[0]) + defTypeD[entry[1]] + str(entry[2]) + entry[3] def iter(self, dummy=False): @@ -94,14 +96,14 @@ def iter(self, dummy=False): if dummy: yield maxId, None, None, None - def exists(self, idx, line_num): + def exists(self, idx: int, line_num: int): for id, _, line, _ in self.entries: if id == idx and int(line) == line_num: return True return False - def append(self, id, type, line, family): + def append(self, id: int, type, line: int, family: str): if type not in defTypeD: return @@ -113,19 +115,18 @@ def append(self, id, type, line, family): self.add_family(family) - def pack(self): - data = ",".join(self.encode_entry((id, *entry)) for id, vals in self.entries.items() for entry in vals) - return data.encode() + b'#' + self.families - - def add_family(self, family): - family = family.encode() - if not family in self.families.split(b','): - if self.families != b'': - family = b',' + family + def pack(self) -> bytes: + data = ",".join([self.encode_entry((id, *entry)) for id, vals in self.entries.items() for entry in vals]) + return (data + '#' + self.families).encode() + + def add_family(self, family: str): + if not family in self.families.split(','): + if self.families != '': + family = ',' + family self.families += family def get_families(self): - return self.families.decode().split(',') + return self.families.split(',') def get_macros(self): return [entry[3] for val in self.entries.values() for entry in val if entry[1] == 'macro'] @@ -335,11 +336,16 @@ def put_raw(self, key, val, sync=False): self.db.sync() def sync(self): + start = time.time() + flushed = 0 if not self.readonly: for k, v in self.cache.items(): if v.modified: + v.modified = False self.put_raw(k, v) + flushed += 1 + print("synced", flushed, "/", len(self.cache), 
time.time()-start) self.db.sync() def close(self): diff --git a/elixir/update.py b/elixir/update.py index 0b7b601a..debaeff2 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -2,6 +2,7 @@ import logging import time import signal +import cProfile from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool from typing import Dict, Iterable, List, Optional, Tuple @@ -364,6 +365,7 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): logger.info("dts comps docs done") + #with cProfile.Profile() as pr: db.defs.close() db.defs.readonly = True db.defs.open() @@ -372,6 +374,8 @@ def update_version(db: DB, tag: bytes, pool: Pool, dts_comp_support: bool): ref_idxes = [(idx, db.defs.filename) for idx in idxes] ref_chunksize = int(len(ref_idxes) / cpu_count()) ref_chunksize = min(max(1, ref_chunksize), 100) + #pr.dump_stats("5refs"+str(int(time.time()))) + logger.info("ref blobs: %d", len(ref_idxes)) for result in pool.imap_unordered(call_get_refs, ref_idxes, ref_chunksize): @@ -408,7 +412,7 @@ def ignore_sigint(): set_start_method('spawn') with Pool(initializer=ignore_sigint) as pool: for tag in scriptLines('list-tags'): - #if not tag.startswith(b'v6.1') or b'rc' in tag: + #if not tag.startswith(b'v6'): # continue if sigint_caught: From 13814bddb4153f967969a7e6e31327b5669f7205 Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Sun, 6 Jul 2025 21:41:35 +0200 Subject: [PATCH 27/29] def pack/unpack optimization part 2 --- elixir/data.py | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 3eed7c4e..b74fe54b 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -65,34 +65,25 @@ class DefList: a line number and a file family. 
Also stores in which families the ident exists for faster tests.''' def __init__(self, data=b'#'): - data, tmp_families = data.split(b'#') - self.families = tmp_families.decode() + data, self.families = data.split(b'#') self.modified = False self.entries = OrderedDict() - tmp_entries = [self.decode_entry(d) for d in deflist_regex.findall(data)] - tmp_entries.sort(key=lambda x:int(x[0])) + tmp_entries = [ + (int(d[0]), d[1], int(d[2]), d[3]) + for d in deflist_regex.findall(data) + ] for id, type, line, family in tmp_entries: if id not in self.entries: self.entries[id] = [(type, line, family)] else: self.entries[id].append((type, line, family)) - def decode_entry(self, entry): - id = int(entry[0]) - type = defTypeR [entry[1].decode()] - line = int(entry[2]) - family = entry[3].decode() - return id, type, line, family - - def encode_entry(self, entry) -> str: - return str(entry[0]) + defTypeD[entry[1]] + str(entry[2]) + entry[3] - def iter(self, dummy=False): # Get all element in a list of sublists and sort them for id, val in self.entries.items(): for type, line, family in val: - yield id, type, line, family + yield id, defTypeR[type.decode()], int(line), family.decode() if dummy: yield maxId, None, None, None @@ -109,27 +100,32 @@ def append(self, id: int, type, line: int, family: str): self.modified = True if id not in self.entries: - self.entries[id] = [(type, line, family)] + self.entries[id] = [(defTypeD[type].encode(), line, family.encode())] else: - self.entries[id].append((type, line, family)) + self.entries[id].append((defTypeD[type].encode(), line, family.encode())) self.add_family(family) def pack(self) -> bytes: - data = ",".join([self.encode_entry((id, *entry)) for id, vals in self.entries.items() for entry in vals]) - return (data + '#' + self.families).encode() + entries = [(id, *entry) for id, vals in self.entries.items() for entry in vals] + entries.sort(key=lambda x:int(x[0])) + data = b",".join([ + str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] + for arg in entries + ]) + return data + b'#' + self.families def add_family(self, family: str): - if not family in self.families.split(','): - if self.families != '': + if not family in self.families.split(b','): + if self.families != b'': family = ',' + family - self.families += family + self.families += family.encode() def get_families(self): - return self.families.split(',') + return [f.decode() for f in self.families.split(b',')] def get_macros(self): - return [entry[3] for val in self.entries.values() for entry in val if entry[1] == 'macro'] + return [entry[2].decode() for val in self.entries.values() for entry in val if entry[0] == b'M'] class PathList: '''Stores associations between a blob ID and a file path. From 99e7c930563924cb9e874a2fb6c23126ef9414ed Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 7 Jul 2025 13:49:27 +0200 Subject: [PATCH 28/29] optimize deflist for append --- elixir/data.py | 83 +++++++++++++++++++++++++++++------------------- elixir/update.py | 24 ++++++++++---- 2 files changed, 68 insertions(+), 39 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index b74fe54b..127ecaaf 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -65,29 +65,43 @@ class DefList: a line number and a file family. 
Also stores in which families the ident exists for faster tests.''' def __init__(self, data=b'#'): - data, self.families = data.split(b'#') + self.data, self.families = data.split(b'#') self.modified = False - self.entries = OrderedDict() - tmp_entries = [ - (int(d[0]), d[1], int(d[2]), d[3]) - for d in deflist_regex.findall(data) - ] - for id, type, line, family in tmp_entries: - if id not in self.entries: - self.entries[id] = [(type, line, family)] - else: - self.entries[id].append((type, line, family)) + self.entries = None + self.to_append = [] + + def populate_entries(self): + entries_modified = False + if self.entries is None: + self.entries = [ + (int(d[0]), d[1], int(d[2]), d[3]) + for d in deflist_regex.findall(self.data) + ] + entries_modified = True + + if len(self.to_append) != 0: + self.entries += self.to_append + self.to_append = [] + entries_modified = True + + if entries_modified: + self.entries.sort(key=lambda x:int(x[0])) def iter(self, dummy=False): # Get all element in a list of sublists and sort them - for id, val in self.entries.items(): - for type, line, family in val: - yield id, defTypeR[type.decode()], int(line), family.decode() + if self.entries is None: + self.populate_entries() + + for id, type, line, family in self.entries: + yield id, defTypeR[type.decode()], int(line), family.decode() if dummy: yield maxId, None, None, None def exists(self, idx: int, line_num: int): + if self.entries is None: + self.populate_entries() + for id, _, line, _ in self.entries: if id == idx and int(line) == line_num: return True @@ -99,21 +113,28 @@ def append(self, id: int, type, line: int, family: str): return self.modified = True - if id not in self.entries: - self.entries[id] = [(defTypeD[type].encode(), line, family.encode())] + if self.entries is None: + self.to_append.append((id, defTypeD[type].encode(), line, family.encode())) else: - self.entries[id].append((defTypeD[type].encode(), line, family.encode())) + self.entries.append((id, defTypeD[type].encode(), line, family.encode())) self.add_family(family) def pack(self) -> bytes: - entries = [(id, *entry) for id, vals in self.entries.items() for entry in vals] - entries.sort(key=lambda x:int(x[0])) - data = b",".join([ - str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] - for arg in entries - ]) - return data + b'#' + self.families + if self.entries is None: + to_append = b",".join([ + str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] + for arg in self.to_append + ]) + self.to_append = [] + self.data += to_append + return self.data + b'#' + self.families + else: + self.data = b",".join([ + str(arg[0]).encode() + arg[1] + str(arg[2]).encode() + arg[3] + for arg in self.entries + ]) + return self.data + b'#' + self.families def add_family(self, family: str): if not family in self.families.split(b','): @@ -125,7 +146,7 @@ def get_families(self): return [f.decode() for f in self.families.split(b',')] def get_macros(self): - return [entry[2].decode() for val in self.entries.values() for entry in val if entry[0] == b'M'] + return (deflist_macro_regex.findall(self.data.decode()) + [entry[1] for entry in self.to_append]) or '' class PathList: '''Stores associations between a blob ID and a file path. 
@@ -186,14 +207,11 @@ def append(self, id, lines, family): def pack(self): if self.entries is not None: - result = "" - for id, lines, family in self.entries: - result += str(id) + ":" + lines + ":" + family + "\n" + assert len(self.to_append) == 0 + result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.entries]) return result.encode() - else: - result = "" - for id, lines, family in self.to_append: - result += str(id) + ":" + lines + ":" + family + "\n" + elif len(self.to_append) != 0: + result = "".join([str(id) + ":" + lines + ":" + family + "\n" for id, lines, family in self.to_append]) self.data += result.encode() self.to_append = [] return self.data @@ -307,6 +325,7 @@ def get(self, key): return p def get_keys(self): + self.sync() return self.db.keys() def put(self, key, val): diff --git a/elixir/update.py b/elixir/update.py index debaeff2..cd2d2493 100644 --- a/elixir/update.py +++ b/elixir/update.py @@ -2,6 +2,7 @@ import logging import time import signal +import bisect import cProfile from multiprocessing import cpu_count, set_start_method from multiprocessing.pool import Pool @@ -59,9 +60,15 @@ def put(self, key, val): # Check if definition for ident is visible in current version def def_in_version(def_ident: DefList, idx_to_hash_and_filename: IdxCache) -> bool: - for def_idx in def_ident.entries.keys(): + def_ident.populate_entries() + + prev_idx = None + for def_idx, _, _, _ in reversed(def_ident.entries): + if def_idx == prev_idx: + continue if def_idx in idx_to_hash_and_filename: return True + prev_idx = def_idx return False # Add definitions to database @@ -221,12 +228,15 @@ def get_refs(file_id: FileId, defs: CachedBsdDB) -> Optional[RefsDict]: line_num = 1 def deflist_exists(deflist, idx: int, line: int): - if idx not in deflist.entries: - return False - - for _, def_line, _ in deflist.entries[idx]: - if def_line == line: - return True + deflist.populate_entries() + start = bisect.bisect_left(deflist.entries, idx, key=lambda x: x[0]) + + for def_idx, _, def_line, _ in deflist.entries[start:]: + if def_idx == idx: + if def_line == line: + return True + else: + break return False From c789e019b304d2cc374c499cb4f15ae48a06939b Mon Sep 17 00:00:00 2001 From: Franciszek Stachura Date: Mon, 7 Jul 2025 14:55:27 +0200 Subject: [PATCH 29/29] put in evict only if necessary --- elixir/data.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/elixir/data.py b/elixir/data.py index 127ecaaf..74116af1 100755 --- a/elixir/data.py +++ b/elixir/data.py @@ -319,7 +319,7 @@ def get(self, key): self.cache.move_to_end(key) if len(self.cache) > self.cachesize: old_k, old_v = self.cache.popitem(last=False) - if not self.readonly: + if old_v.modified: self.put_raw(old_k, old_v) return p @@ -336,7 +336,8 @@ def put(self, key, val): self.cache.move_to_end(key) if len(self.cache) > self.cachesize: old_k, old_v = self.cache.popitem(last=False) - self.put_raw(old_k, old_v) + if old_v.modified: + self.put_raw(old_k, old_v) def put_raw(self, key, val, sync=False): if self.readonly:
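
The last few patches (25, 26 and 29) converge on a write-back policy for the update cache: cached values carry a modified flag, and both LRU eviction and sync() write an entry back to Berkeley DB only when that flag is set. Below is a minimal standalone sketch of that pattern, using a plain dict as the backing store and simplified names; it is an illustration under those assumptions, not the actual CachedBsdDB code.

from collections import OrderedDict

class Value:
    # Toy stand-in for DefList/RefList: remembers whether it was changed.
    def __init__(self, data=b''):
        self.data = data
        self.modified = False

    def append(self, more: bytes):
        self.data += more
        self.modified = True

    def pack(self) -> bytes:
        return self.data

class WriteBackCache:
    # LRU cache in front of a dict-like store; only dirty entries are flushed.
    def __init__(self, backing, size, readonly=False):
        self.backing = backing
        self.size = size
        self.readonly = readonly
        self.cache = OrderedDict()

    def _evict(self):
        while len(self.cache) > self.size:
            key, val = self.cache.popitem(last=False)   # least recently used
            if val.modified and not self.readonly:      # write back only if dirty
                self.backing[key] = val.pack()

    def get(self, key):
        if key in self.cache:
            self.cache.move_to_end(key)
            return self.cache[key]
        raw = self.backing.get(key)
        if raw is None:
            return None
        val = Value(raw)                                # clean on load
        self.cache[key] = val
        self.cache.move_to_end(key)
        self._evict()
        return val

    def put(self, key, val):
        if self.readonly:
            raise RuntimeError("cache is readonly")
        val.modified = True
        self.cache[key] = val
        self.cache.move_to_end(key)
        self._evict()

    def sync(self):
        # Flush every dirty entry, then mark it clean.
        if self.readonly:
            return
        for key, val in self.cache.items():
            if val.modified:
                self.backing[key] = val.pack()
                val.modified = False

store = {}
cache = WriteBackCache(store, size=2)
v = Value()
v.append(b'1:10,12:C\n')
cache.put(b'ident', v)
cache.sync()
assert store[b'ident'] == b'1:10,12:C\n'

The reason for the flag is visible in the update flow above: during the get_refs pass the definitions database is reopened readonly and is only read, so eviction never triggers redundant writes, while the writer path still flushes every dirty entry on sync() before the database is handed back to the reference pass.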