
Commit ea45b49

Merge pull request #16 from clj-codes/feat/even-better-indexes
fix: consider full symbol as index
2 parents: d9c61d6 + d301805 · commit: ea45b49

File tree

4 files changed: +57 -18 lines

.clj-kondo/config.edn

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{:lint-as {datalevin.interpret/inter-fn clojure.core/fn}}
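
Aside: `datalevin.interpret/inter-fn` is a macro whose syntax mirrors `clojure.core/fn`, so without this `:lint-as` mapping clj-kondo would likely flag its parameter bindings as unresolved symbols. A minimal illustration (not part of the diff):

    ;; linted as clojure.core/fn, so `s` resolves as a local binding
    (inter-fn [s] [[s 0 0]])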

.lsp/config.edn

Lines changed: 2 additions & 1 deletion
@@ -5,4 +5,5 @@
            flow [[:block 1]]
            flow-with-defaults [[:block 1]]
            flow-as-of [[:block 1]]
-           flow-without-validation [[:block 1]]}}}
+           flow-without-validation [[:block 1]]
+           inter-fn [[:inner 0] [:inner 1]]}}}
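
Aside: `[[:inner 0] [:inner 1]]` is a cljfmt-style indent spec telling clojure-lsp's formatter to treat the bindings vector and body of `inter-fn` as inner forms rather than aligning them under the first argument. Roughly (exact output depends on the formatter version):

    (inter-fn [^String s]
      (into (sequence (tokenizer-a s))
            (sequence (tokenizer-b s))))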

dev/playground.clj

Lines changed: 39 additions & 15 deletions
@@ -4,6 +4,7 @@
    [codes.clj.docs.extractor.core :as core]
    [codes.clj.docs.extractor.datalevin :as datalevin]
    [datalevin.core :as d]
+   [datalevin.interpret :refer [inter-fn]]
    [datalevin.search-utils :as su]
    [datalevin.util :as util])
   (:import [java.io File]))
@@ -58,7 +59,7 @@
         db (d/db conn)

         datoms (->> (d/fulltext-datoms db
-                      "ass"
+                      "."
                       {:top 30
                        :domains ["definition-name"
                                  "namespace-name"
@@ -143,15 +144,19 @@

   ; tests with fulltext and analyzer
   (let [query-analyzer (su/create-analyzer
-                         {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                         {:tokenizer (datalevin/merge-tokenizers
+                                       (inter-fn [s] [[s 0 0]])
+                                       (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                          :token-filters [su/lower-case-token-filter]})

         analyzer (su/create-analyzer
-                   {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                   {:tokenizer (datalevin/merge-tokenizers
+                                 (inter-fn [s] [[s 0 0]])
+                                 (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                    :token-filters [su/lower-case-token-filter
                                    su/prefix-token-filter]})

-        dir "/tmp/mydb"
+        dir (str "/tmp/mydb-" (random-uuid))
         conn (d/create-conn dir
                {:text {:db/valueType :db.type/string
                        :db/fulltext true
@@ -169,35 +174,46 @@
               {:text "associative?"}
               {:text "b"}
               {:text "ba"}
-              {:text "bas"}]
+              {:text "bas"}
+              {:text "*"}
+              {:text "/"}
+              {:text "->"}
+              {:text "->>"}
+              {:text "as->"}
+              {:text "."}
+              {:text "as->banana"}]

         _transact (d/transact! conn data)

-        result (->> (d/q '[:find ?i
+        result (->> (d/q '[:find ?e ?v
                            :in $ ?q
                            :where
-                           [(fulltext $ ?q {:top 20}) [[?e]]]
-                           [?e :text ?i]]
+                           [(fulltext $ ?q {:top 20}) [[?e ?a ?v]]]]
                     (d/db conn)
-                    "assoc-me")
+                    "as")
                    doall)]

     (d/close conn)
     (util/delete-files dir)

     result)

-  ; tests with fulltext and analyzer on a raw query
+  ; tests with fulltext and analyzer on a raw query
   (let [query-analyzer (su/create-analyzer
-                         {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                         {:tokenizer (datalevin/merge-tokenizers
+                                       (inter-fn [s] [[s 0 0]])
+                                       (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                          :token-filters [su/lower-case-token-filter]})

         analyzer (su/create-analyzer
-                   {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                   {:tokenizer (datalevin/merge-tokenizers
+                                 (inter-fn [s] [[s 0 0]])
+                                 (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                    :token-filters [su/lower-case-token-filter
                                    su/prefix-token-filter]})

-        lmdb (d/open-kv "/tmp/mydb")
+        dir (str "/tmp/lmdb-" (random-uuid))
+        lmdb (d/open-kv dir)

         engine (d/new-search-engine lmdb {:query-analyzer query-analyzer
                                           :analyzer analyzer
@@ -213,13 +229,21 @@
                7 "associative?"
                8 "b"
                9 "ba"
-               10 "bas"}
+               10 "bas"
+               11 "->"
+               12 "->>"
+               13 "as->"
+               14 "as->banana"
+               15 "/"
+               16 "*"
+               17 "."}

         _transact (doseq [[k v] input]
                     (d/add-doc engine k v))

-        result (doall (d/search engine "assoc-m" {:top 20 :display :texts}))]
+        result (doall (d/search engine "->" {:top 20 :display :texts}))]

     (d/close-kv lmdb)
+    (util/delete-files dir)

     result))
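
Aside on why the new punctuation-only inputs matter: every character of symbols like "->", "." or "/" falls inside the regexp tokenizer's delimiter class, so that tokenizer alone yields nothing to index for them and they could never be found. Merging in a tokenizer that emits the whole string as one token makes such symbols searchable. A REPL sketch, assuming Datalevin's [term position offset] token convention (positions and offsets illustrative):

    (def split-tok (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
    (def full-tok  (inter-fn [s] [[s 0 0]]))

    (sequence (split-tok "->"))          ;; likely => (), nothing indexed
    (sequence (full-tok "->"))           ;; => (["->" 0 0]), the full symbol
    (sequence (split-tok "as->banana"))  ;; likely => (["as" ...] ["banana" ...])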

src/codes/clj/docs/extractor/datalevin.clj

Lines changed: 15 additions & 2 deletions
@@ -1,5 +1,6 @@
 (ns codes.clj.docs.extractor.datalevin
   (:require [datalevin.core :as d]
+            [datalevin.interpret :refer [inter-fn]]
             [datalevin.search-utils :as su]))

 ;; TODO: add id :db.unique/identity and ref :db.type/ref
@@ -78,12 +79,24 @@
 (def db-schemas
   (merge project-schema namespace-schema definition-schema))

+(defn merge-tokenizers
+  "Merges the results of tokenizer a and b into one sequence."
+  [tokenizer-a tokenizer-b]
+  (inter-fn [^String s]
+    (into (sequence (tokenizer-a s))
+          (sequence (tokenizer-b s)))))
+
 (defn bulk-transact! [datoms config]
   (let [query-analyzer (su/create-analyzer
-                         {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                         {:tokenizer (merge-tokenizers
+                                       (inter-fn [s] [[s 0 0]])
+                                       (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                          :token-filters [su/lower-case-token-filter]})
+
         analyzer (su/create-analyzer
-                   {:tokenizer (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")
+                   {:tokenizer (merge-tokenizers
+                                 (inter-fn [s] [[s 0 0]])
+                                 (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+"))
                    :token-filters [su/lower-case-token-filter
                                    su/prefix-token-filter]})
         conn (-> config :db :dir
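
Aside: a sketch of what the merged tokenizer built above should emit (shapes assumed from the same [term position offset] convention; positions and offsets illustrative):

    (def tokenizer
      (merge-tokenizers
        (inter-fn [s] [[s 0 0]])
        (su/create-regexp-tokenizer #"[\s:/\.;,!=?\"'()\[\]{}|<>&@#^*\\~`\-]+")))

    (sequence (tokenizer "assoc-in"))
    ;; likely => (["assoc-in" 0 0] ["assoc" ...] ["in" ...])
    ;; the exact symbol is indexed alongside its fragments; with
    ;; su/prefix-token-filter applied downstream, a query can match the
    ;; full symbol, a prefix of it, or any split part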
