feat(search): custom tokenizer

2024-06-08 09:30:46 +00:00
parent 7e93c8529a
commit ebdb9de9ba
1 changed files with 38 additions and 2 deletions
--- a/.vitepress/constants.ts
+++ b/.vitepress/constants.ts
@@ -19,18 +19,54 @@ export const feedback = `<a href="/feedback" class="feedback-footer">Made with
 export const search: DefaultTheme.Config['search'] = {
  options: {
    miniSearch: {
+      options: {
+        tokenize: (text) => text.split(/[\n\r #%*,=/:;?[\]{}()&]+/u), // simplified charset: removed [-_.@] and non-english chars (diacritics etc.)
+        processTerm: (term, fieldName) => { // normalizes one token; returns false (MiniSearch discards falsy terms), a single term, or a list of terms
+          term = term
+            .trim()
+            .toLowerCase()
+            .replace(/^\.+/, '') // strip leading dots
+            .replace(/\.+$/, '') // strip trailing dots
+          const stopWords = [ // normalized terms that are rejected outright (compared after lowercasing/dot-stripping)
+            'frontmatter',
+            '$frontmatter.synopsis',
+            'and',
+            'about',
+            'but',
+            'now',
+            'the',
+            'with',
+            'you'
+          ]
+          if (term.length < 2 || stopWords.includes(term)) return false // reject empty/one-character terms and stop words
+
+          if (fieldName === 'text') { // dotted expansion applies only when the field name is 'text'
+            const parts = term.split('.')
+            if (parts.length > 1) { // the term still contains at least one inner dot
+              const newTerms = [term, ...parts] // keep the full dotted term and add each dot-separated piece
+                .filter((t) => t.length >= 2) // same minimum length as the check above
+                .filter((t) => !stopWords.includes(t)) // same stop-word list as the check above
+              return newTerms
+            }
+          }
+          return term
+        }
+      },
      searchOptions: {
        combineWith: 'AND',
-        fuzzy: false,
+        fuzzy: true,
        // @ts-ignore
        boostDocument: (
-          _,
+          documentId,
          term,
          storedFields: Record<string, string | string[]>
        ) => {
          const titles = (storedFields?.titles as string[])
            .filter((t) => Boolean(t))
            .map((t) => t.toLowerCase())
+          // Downrank posts
+          if (documentId.match(/\/posts/)) return -5
+
          // Uprate if term appears in titles. Add bonus for higher levels (i.e. lower index)
          const titleIndex =
            titles