5 Commits

Author SHA1 Message Date
Alex Schroeder
2188f99dea go fmt 2023-09-26 00:07:34 +02:00
Alex Schroeder
c063174063 Allow b elements in snippets 2023-09-25 17:20:35 +02:00
Alex Schroeder
e1258da63b Index hashtags and grep the rest 2023-09-25 16:31:33 +02:00
Alex Schroeder
8eb700fb0b Use full text search 2023-09-25 14:07:00 +02:00
Alex Schroeder
7514c2173b Add exact search from the command line 2023-09-25 09:21:13 +02:00
24 changed files with 392 additions and 100 deletions

View File

@@ -2,10 +2,10 @@ package main
import (
"encoding/json"
"fmt"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"io"
"log"
"net/http"
"os"
"sync"
@@ -82,7 +82,7 @@ func account(p *parser.Parser, data []byte, offset int) (int, ast.Node) {
uri, ok := accounts.uris[string(account)]
defer accounts.RUnlock()
if !ok {
fmt.Printf("Looking up %s\n", account)
log.Printf("Looking up %s\n", account)
uri = "https://" + string(domain) + "/users/" + string(user[1:])
accounts.uris[string(account)] = uri // prevent more lookups
go lookUpAccountUri(string(account), string(domain))
@@ -103,26 +103,26 @@ func lookUpAccountUri(account, domain string) {
uri := "https://" + domain + "/.well-known/webfinger"
resp, err := http.Get(uri + "?resource=acct:" + account)
if err != nil {
fmt.Printf("Failed to look up %s: %s\n", account, err)
log.Printf("Failed to look up %s: %s", account, err)
return
}
defer resp.Body.Close()
body, err := io.ReadAll(resp.Body)
if err != nil {
fmt.Printf("Failed to read from %s: %s\n", account, err)
log.Printf("Failed to read from %s: %s", account, err)
return
}
var wf WebFinger
err = json.Unmarshal([]byte(body), &wf)
if err != nil {
fmt.Printf("Failed to parse the JSON from %s: %s\n", account, err)
log.Printf("Failed to parse the JSON from %s: %s", account, err)
return
}
uri, err = parseWebFinger(body)
if err != nil {
fmt.Printf("Could not find profile URI for %s: %s\n", account, err)
log.Printf("Could not find profile URI for %s: %s", account, err)
}
fmt.Printf("Found profile for %s: %s\n", account, uri)
log.Printf("Found profile for %s: %s", account, uri)
accounts.Lock()
defer accounts.Unlock()
accounts.uris[account] = uri

View File

@@ -29,7 +29,6 @@ It's not `)}
HTTPRedirectTo(t, makeHandler(appendHandler, true), "POST", "/append/testdata/fire", data, "/view/testdata/fire")
assert.Regexp(t, regexp.MustCompile("Its not barbecue"),
assert.HTTPBody(makeHandler(viewHandler, true), "GET", "/view/testdata/fire", nil))
t.Cleanup(func() {
_ = os.RemoveAll("testdata")
})

View File

@@ -1,11 +1,11 @@
package main
import (
"bytes"
"github.com/gomarkdown/markdown"
"github.com/gomarkdown/markdown/ast"
"github.com/gomarkdown/markdown/parser"
"html/template"
"bytes"
"os"
"path"
"time"

View File

@@ -7,7 +7,7 @@ import (
)
func TestFeed(t *testing.T) {
assert.Contains(t,
assert.Contains(t,
assert.HTTPBody(makeHandler(viewHandler, true), "GET", "/view/index.rss", nil),
"Welcome to Oddµ")
}
@@ -16,7 +16,7 @@ func TestFeed(t *testing.T) {
func TestFeedItems(t *testing.T) {
_ = os.RemoveAll("testdata")
index.load()
p1 := &Page{Name: "testdata/cactus", Body: []byte(`# Cactus
Green head and white hair
A bench in the evening sun

1
go.mod
View File

@@ -4,7 +4,6 @@ go 1.21.0
require (
github.com/anthonynsimon/bild v0.13.0
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0
github.com/gomarkdown/markdown v0.0.0-20230912175223-14b07df9d538
github.com/google/subcommands v1.2.0
github.com/hexops/gotextdiff v1.0.3

2
go.sum
View File

@@ -10,8 +10,6 @@ github.com/coreos/go-semver v0.2.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3Ee
github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0 h1:b+7JSiBM+hnLQjP/lXztks5hnLt1PS46hktG9VOJgzo=
github.com/dgryski/go-trigram v0.0.0-20160407183937-79ec494e1ad0/go.mod h1:qzKC/DpcxK67zaSHdCmIv3L9WJViHVinYXN2S7l3RM8=
github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/gomarkdown/markdown v0.0.0-20230912175223-14b07df9d538 h1:ePDpFu7l0QUV46/9A7icfL2wvIOzTJLCWh4RO2NECzE=

View File

@@ -30,7 +30,7 @@ func (cmd *htmlCmd) Execute(_ context.Context, f *flag.FlagSet, _ ...interface{}
}
func htmlCli(w io.Writer, useTemplate bool, args []string) subcommands.ExitStatus {
for _, arg := range args {
for _, arg := range args {
p, err := loadPage(arg)
if err != nil {
fmt.Fprintf(w, "Cannot load %s: %s\n", arg, err)

View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"testing"
"github.com/google/subcommands"
)
func TestHtmlCmd(t *testing.T) {

224
index.go
View File

@@ -1,43 +1,97 @@
// Read Artem Krylysov's blog post on full text search as an
// introduction.
// https://artem.krylysov.com/blog/2020/07/28/lets-build-a-full-text-search-engine/
package main
import (
trigram "github.com/dgryski/go-trigram"
"io/fs"
"log"
"path/filepath"
"sort"
"strings"
"sync"
)
type docid uint
// Index contains the two maps used for search. Make sure to lock and
// unlock as appropriate.
type Index struct {
sync.RWMutex
// index is a struct containing the trigram index for search.
// It is generated at startup and updated after every page
// edit. The index is case-insensitive.
index trigram.Index
// next_id is the number of the next document added to the index
next_id docid
// documents is a map, mapping document ids of the index to
// page names.
documents map[trigram.DocID]string
// index is an inverted index mapping tokens to document ids.
token map[string][]docid
// names is a map, mapping page names to titles.
// documents is a map, mapping document ids to page names.
documents map[docid]string
// titles is a map, mapping page names to titles.
titles map[string]string
}
// idx is the global Index per wiki.
var index Index
// reset resets the Index. This assumes that the index is locked!
func (idx *Index) reset() {
idx.index = nil
idx.token = nil
idx.documents = nil
idx.titles = nil
}
// addDocument assigns the next free document id to the text, adds the
// text's hashtags to the inverted index, and returns the new id. This
// assumes that the index is locked!
func (idx *Index) addDocument(text []byte) docid {
	id := idx.next_id
	idx.next_id++ // ids only ever grow; they are never reused
	for _, token := range hashtags(text) {
		ids := idx.token[token]
		// Don't add same ID more than once. Checking the last
		// position of the []docid works because the id is
		// always a new one, i.e. the last one, if at all.
		if ids != nil && ids[len(ids)-1] == id {
			continue
		}
		idx.token[token] = append(ids, id)
	}
	return id
}
// deleteDocument removes the document with the given id and text from
// the inverted index. The id can no longer be used afterwards. This
// assumes that the index is locked!
func (idx *Index) deleteDocument(text []byte, id docid) {
	for _, token := range hashtags(text) {
		// Use the receiver, not the package-global index, so the
		// method works on any Index value.
		ids := idx.token[token]
		// A token can appear multiple times in a text, but it is
		// only indexed once and therefore only deleted once.
		if ids == nil {
			continue
		}
		// If the token appears only in this document, remove
		// the whole entry.
		if len(ids) == 1 && ids[0] == id {
			delete(idx.token, token)
			continue
		}
		// Otherwise, remove just this document id. Note that
		// sort.Search returns len(ids) when nothing matches,
		// never -1, so no -1 check is needed.
		i := sort.Search(len(ids), func(i int) bool { return ids[i] >= id })
		if i < len(ids) && ids[i] == id {
			copy(ids[i:], ids[i+1:])
			idx.token[token] = ids[:len(ids)-1]
			continue
		}
		// If none of the above, then our docid wasn't
		// indexed. This shouldn't happen.
		log.Printf("The index for token %s does not contain doc id %d", token, id)
	}
	delete(idx.documents, id)
}
// add reads a file and adds it to the index. This must happen while
// the idx is locked, which is true when called from loadIndex.
// the idx is locked.
func (idx *Index) add(path string, info fs.FileInfo, err error) error {
if err != nil {
return err
@@ -52,7 +106,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
return err
}
p.handleTitle(false)
id := idx.index.Add(strings.ToLower(string(p.Body)))
id := idx.addDocument(p.Body)
idx.documents[id] = p.Name
idx.titles[p.Name] = p.Title
return nil
@@ -63,8 +118,8 @@ func (idx *Index) add(path string, info fs.FileInfo, err error) error {
func (idx *Index) load() (int, error) {
idx.Lock()
defer idx.Unlock()
idx.index = make(trigram.Index)
idx.documents = make(map[trigram.DocID]string)
idx.token = make(map[string][]docid)
idx.documents = make(map[docid]string)
idx.titles = make(map[string]string)
err := filepath.Walk(".", idx.add)
if err != nil {
@@ -75,15 +130,23 @@ func (idx *Index) load() (int, error) {
return n, nil
}
// dump prints the inverted index to the log for debugging. It takes
// its own read lock, so the caller must NOT already hold the lock.
func (idx *Index) dump() {
	index.RLock()
	defer index.RUnlock()
	for token, ids := range idx.token {
		log.Printf("%s: %v", token, ids)
	}
}
// updateIndex updates the index for a single page. The old text is
// loaded from the disk and removed from the index first, if it
// exists.
func (p *Page) updateIndex() {
index.Lock()
defer index.Unlock()
var id trigram.DocID
// This function does not rely on files actually existing, so
// let's quickly find the document id.
var id docid
// Reverse lookup! At least it's in memory.
for docId, name := range index.documents {
if name == p.Name {
id = docId
@@ -91,33 +154,120 @@ func (p *Page) updateIndex() {
}
}
if id == 0 {
id = index.index.Add(strings.ToLower(string(p.Body)))
id = index.addDocument(p.Body)
index.documents[id] = p.Name
index.titles[p.Name] = p.Title
} else {
o, err := loadPage(p.Name)
if err == nil {
index.index.Delete(strings.ToLower(string(o.Body)), id)
o.handleTitle(false)
delete(index.titles, o.Title)
if o, err := loadPage(p.Name); err == nil {
index.deleteDocument(o.Body, id)
}
index.index.Insert(strings.ToLower(string(p.Body)), id)
// Do not reuse the old id. We need a new one for
// indexing to work.
id = index.addDocument(p.Body)
index.documents[id] = p.Name
p.handleTitle(false)
// The page name stays the same but the title may have
// changed.
index.titles[p.Name] = p.Title
}
}
// searchDocuments searches the index for a string. This requires the
// index to be locked.
func searchDocuments(q string) []string {
words := strings.Fields(strings.ToLower(q))
var trigrams []trigram.T
for _, word := range words {
trigrams = trigram.Extract(word, trigrams)
// removeFromIndex removes the page from the index. Do this when
// deleting a page.
func (p *Page) removeFromIndex() {
index.Lock()
defer index.Unlock()
var id docid
// Reverse lookup! At least it's in memory.
for docId, name := range index.documents {
if name == p.Name {
id = docId
break
}
}
ids := index.index.QueryTrigrams(trigrams)
names := make([]string, len(ids))
for i, id := range ids {
names[i] = index.documents[id]
if id == 0 {
log.Printf("Page %s is not indexed", p.Name)
return
}
return names
o, err := loadPage(p.Name)
if err != nil {
log.Printf("Page %s cannot removed from the index: %s", p.Name, err)
return
}
index.deleteDocument(o.Body, id)
}
// search searches the index for a query string and returns matching
// page names. Candidates are narrowed using the hashtag index if the
// query contains hashtags; the remaining query tokens are then
// verified with grep.
// NOTE(review): this locks the package-global index rather than the
// receiver idx — confirm that is intentional.
func (idx *Index) search(q string) []string {
	index.RLock()
	defer index.RUnlock()
	names := make([]string, 0)
	hashtags := hashtags([]byte(q))
	if len(hashtags) > 0 {
		// Start with the ids for the first hashtag and narrow
		// the set down with every further hashtag.
		var r []docid
		for _, token := range hashtags {
			if ids, ok := idx.token[token]; ok {
				if r == nil {
					r = ids
				} else {
					r = intersection(r, ids)
				}
			} else {
				// Token doesn't exist therefore abort search.
				return nil
			}
		}
		for _, id := range r {
			names = append(names, idx.documents[id])
		}
	} else {
		// No hashtags in the query: every known page is a
		// candidate for grep.
		for _, name := range idx.documents {
			names = append(names, name)
		}
	}
	return grep(tokens(q), names)
}
// grep loads every named page and keeps those whose body contains all
// of the given tokens (matched case-insensitively). Pages that fail
// to load are logged and skipped.
func grep(tokens, names []string) []string {
	matches := make([]string, 0)
	for _, name := range names {
		p, err := loadPage(name)
		if err != nil {
			log.Printf("Cannot load %s: %s", name, err)
			continue
		}
		text := strings.ToLower(string(p.Body))
		ok := true
		for _, token := range tokens {
			if !strings.Contains(text, token) {
				ok = false
				break
			}
		}
		if ok {
			matches = append(matches, name)
		}
	}
	return matches
}
// intersection returns the set intersection between a and b.
// a and b have to be sorted in ascending order and contain no duplicates.
func intersection(a []docid, b []docid) []docid {
	longest := len(a)
	if len(b) > longest {
		longest = len(b)
	}
	result := make([]docid, 0, longest)
	i, j := 0, 0
	// Classic merge walk over two sorted sequences.
	for i < len(a) && j < len(b) {
		switch {
		case a[i] < b[j]:
			i++
		case a[i] > b[j]:
			j++
		default:
			result = append(result, a[i])
			i++
			j++
		}
	}
	return result
}

View File

@@ -27,9 +27,10 @@ func TestSearchHashtag(t *testing.T) {
assert.NotZero(t, len(pages))
}
// wipes testdata
func TestIndexUpdates(t *testing.T) {
name := "test"
_ = os.Remove(name + ".md")
_ = os.RemoveAll("testdata")
name := "testdata/test"
index.load()
p := &Page{Name: name, Body: []byte("This is a test.")}
p.save()
@@ -92,6 +93,6 @@ func TestIndexUpdates(t *testing.T) {
assert.True(t, found)
t.Cleanup(func() {
_ = os.Remove(name + ".md")
_ = os.RemoveAll("testdata")
})
}

13
page.go
View File

@@ -2,9 +2,9 @@ package main
import (
"bytes"
"fmt"
"github.com/microcosm-cc/bluemonday"
"html/template"
"log"
"net/url"
"os"
"path/filepath"
@@ -27,10 +27,12 @@ type Page struct {
Hashtags []string
}
// santize uses bluemonday to sanitize the HTML.
// No exceptions are made because this is used for snippets.
// sanitizeStrict uses bluemonday to sanitize the HTML. An exception
// is made for the b tag because this is used for snippets.
func sanitizeStrict(s string) template.HTML {
return template.HTML(bluemonday.StrictPolicy().Sanitize(s))
policy := bluemonday.StrictPolicy()
policy.AllowElements("b")
return template.HTML(policy.Sanitize(s))
}
// santizeBytes uses bluemonday to sanitize the HTML.
@@ -60,6 +62,7 @@ func (p *Page) save() error {
filename := p.Name + ".md"
s := bytes.ReplaceAll(p.Body, []byte{'\r'}, []byte{})
if len(s) == 0 {
p.removeFromIndex()
_ = os.Rename(filename, filename+"~")
return os.Remove(filename)
}
@@ -69,7 +72,7 @@ func (p *Page) save() error {
if d != "." {
err := os.MkdirAll(d, 0755)
if err != nil {
fmt.Printf("Creating directory %s failed", d)
log.Printf("Creating directory %s failed: %s", d, err)
return err
}
}

View File

@@ -35,7 +35,7 @@ func wikiLink(p *parser.Parser, fn func(p *parser.Parser, data []byte, offset in
// hashtag returns an inline parser function. This indirection is
// required because we want to receive an array of hashtags found.
func hashtag() (func(p *parser.Parser, data []byte, offset int) (int, ast.Node), *[]string) {
hashtags := make([]string,0)
hashtags := make([]string, 0)
return func(p *parser.Parser, data []byte, offset int) (int, ast.Node) {
data = data[offset:]
i := 0
@@ -49,8 +49,8 @@ func hashtag() (func(p *parser.Parser, data []byte, offset int) (int, ast.Node),
hashtags = append(hashtags, string(data[1:i]))
link := &ast.Link{
AdditionalAttributes: []string{`class="tag"`},
Destination: append([]byte("/search?q=%23"), data[1:i]...),
Title: data[0:i],
Destination: append([]byte("/search?q=%23"), data[1:i]...),
Title: data[0:i],
}
text := bytes.ReplaceAll(data[0:i], []byte("_"), []byte(" "))
ast.AppendChild(link, &ast.Text{Leaf: ast.Leaf{Literal: text}})

View File

@@ -2,8 +2,8 @@ package main
import (
"bytes"
"github.com/stretchr/testify/assert"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"os"
"testing"
)
@@ -32,7 +32,7 @@ You are no planet`)}
1 change was made.
This is a dry run. Use -confirm to make it happen.
`
b := new(bytes.Buffer)
s := replaceCli(b, false, []string{`\bno planet`, `planetoid`})
assert.Equal(t, subcommands.ExitSuccess, s)

View File

@@ -1,7 +1,7 @@
package main
import (
"fmt"
"log"
"net/http"
"slices"
"strconv"
@@ -70,7 +70,7 @@ func load(names []string) []*Page {
for i, name := range names {
p, err := loadPage(name)
if err != nil {
fmt.Printf("Error loading %s\n", name)
log.Printf("Error loading %s: %s", name, err)
} else {
items[i] = p
}
@@ -89,10 +89,8 @@ func search(q string, page int) ([]*Page, bool, int) {
if len(q) == 0 {
return make([]*Page, 0), false, 0
}
index.RLock()
names := searchDocuments(q)
names := index.search(q)
slices.SortFunc(names, sortNames(q))
index.RUnlock()
from := itemsPerPage * (page - 1)
if from > len(names) {
return make([]*Page, 0), false, 0

View File

@@ -1,20 +1,27 @@
package main
import (
"bytes"
"context"
"flag"
"fmt"
"github.com/google/subcommands"
"io"
"io/fs"
"os"
"path/filepath"
"slices"
"strings"
)
type searchCmd struct {
page int
page int
exact bool
}
func (cmd *searchCmd) SetFlags(f *flag.FlagSet) {
f.IntVar(&cmd.page, "page", 1, "the page in the search result set")
f.BoolVar(&cmd.exact, "exact", false, "look for exact matches (do not use the trigram index)")
}
func (*searchCmd) Name() string { return "search" }
@@ -29,15 +36,21 @@ func (*searchCmd) Usage() string {
}
func (cmd *searchCmd) Execute(_ context.Context, f *flag.FlagSet, _ ...interface{}) subcommands.ExitStatus {
return searchCli(os.Stdout, cmd.page, f.Args())
return searchCli(os.Stdout, cmd.page, cmd.exact, f.Args())
}
// searchCli runs the search command on the command line. It is used
// here with an io.Writer for easy testing.
func searchCli(w io.Writer, n int, args []string) subcommands.ExitStatus {
index.load()
func searchCli(w io.Writer, n int, exact bool, args []string) subcommands.ExitStatus {
var fn func(q string, n int) ([]*Page, bool, int)
if exact {
fn = searchExact
} else {
index.load()
fn = search
}
for _, q := range args {
items, more, _ := search(q, n)
items, more, _ := fn(q, n)
if len(items) == 1 {
fmt.Fprintf(w, "Search for %s, page %d: 1 result\n", q, n)
} else {
@@ -52,3 +65,58 @@ func searchCli(w io.Writer, n int, args []string) subcommands.ExitStatus {
}
return subcommands.ExitSuccess
}
// searchExact opens all the files and searches them, one by one. The
// terms are matched exactly (byte-for-byte, case sensitive), unlike
// the index-based search. It returns the pages for the requested
// result page, whether more results exist, and the total number of
// result pages.
func searchExact(q string, page int) ([]*Page, bool, int) {
	if len(q) == 0 {
		return make([]*Page, 0), false, 0
	}
	terms := bytes.Fields([]byte(q))
	pages := make(map[string]*Page)
	names := make([]string, 0)
	// NOTE(review): this resets the global title map without taking
	// the index lock — presumably only safe from the command line;
	// verify before calling this from the web server.
	index.titles = make(map[string]string)
	err := filepath.Walk(".", func(path string, info fs.FileInfo, err error) error {
		if err != nil {
			return err
		}
		filename := path
		// Skip directories, hidden files, and anything that is
		// not a Markdown page.
		if info.IsDir() || strings.HasPrefix(filename, ".") || !strings.HasSuffix(filename, ".md") {
			return nil
		}
		name := strings.TrimSuffix(filename, ".md")
		p, err := loadPage(name)
		if err != nil {
			return err
		}
		// Every term must appear in the page body.
		for _, term := range terms {
			if !bytes.Contains(p.Body, term) {
				return nil
			}
		}
		p.handleTitle(false)
		pages[p.Name] = p
		index.titles[p.Name] = p.Title
		names = append(names, p.Name)
		return nil
	})
	if err != nil {
		return make([]*Page, 0), false, 0
	}
	slices.SortFunc(names, sortNames(q))
	// Slice out the requested result page.
	from := itemsPerPage * (page - 1)
	if from > len(names) {
		return make([]*Page, 0), false, 0
	}
	to := from + itemsPerPage
	if to > len(names) {
		to = len(names)
	}
	items := make([]*Page, 0)
	for i := from; i < to; i++ {
		p := pages[names[i]]
		p.score(q)
		p.summarize(q)
		items = append(items, p)
	}
	return items, to < len(names), len(names)/itemsPerPage + 1
}

View File

@@ -2,14 +2,14 @@ package main
import (
"bytes"
"github.com/google/subcommands"
"github.com/stretchr/testify/assert"
"testing"
"github.com/google/subcommands"
)
func TestSearchCmd(t *testing.T) {
b := new(bytes.Buffer)
s := searchCli(b, 1, []string{"oddµ"})
s := searchCli(b, 1, false, []string{"oddµ"})
assert.Equal(t, subcommands.ExitSuccess, s)
r := `Search for oddµ, page 1: 2 results
* [Oddµ: A minimal wiki](README) (5)

View File

@@ -3,8 +3,8 @@ package main
import (
"github.com/stretchr/testify/assert"
"net/url"
"testing"
"os"
"testing"
)
func TestSearch(t *testing.T) {
@@ -19,12 +19,14 @@ func TestSearchQuestionmark(t *testing.T) {
_ = os.RemoveAll("testdata")
p := &Page{Name: "testdata/Odd?", Body: []byte(`# Even?
yes or no?`)}
We look at the plants.
They need water. We need us.
The silence streches.`)}
p.save()
data := url.Values{}
data.Set("q", "yes")
data.Set("q", "look")
body := assert.HTTPBody(searchHandler, "GET", "/search", data)
assert.Contains(t, body, "yes or no?")
assert.Contains(t, body, "We <b>look</b>")
assert.NotContains(t, body, "Odd?")
assert.Contains(t, body, "Even?")
}

View File

@@ -1,20 +1,21 @@
package main
import (
"log"
"regexp"
"strings"
)
// re returns a regular expression matching any word in q.
func re(q string) (*regexp.Regexp, error) {
q = regexp.QuoteMeta(q)
re, err := regexp.Compile(`\s+`)
if err != nil {
return nil, err
fields := strings.Fields(q)
quoted := make([]string, len(fields))
for i, w := range fields {
quoted[i] = regexp.QuoteMeta(w)
}
words := re.ReplaceAllString(q, "|")
re, err = regexp.Compile(`(?i)(` + words + `)`)
re, err := regexp.Compile(`(?i)(` + strings.Join(quoted, "|") + `)`)
if err != nil {
log.Printf("Cannot compile %s %v: %s", q, quoted, err)
return nil, err
}
return re, nil
@@ -26,12 +27,16 @@ func snippets(q string, s string) string {
maxsnippets := 4
re, err := re(q)
// If the compilation didn't work, truncate and return
if err != nil || len(s) <= snippetlen {
if err != nil {
if len(s) > 400 {
s = s[0:400] + " …"
}
return s
}
// Short cut for short pages
if len(s) <= snippetlen {
return highlight(q, re, s)
}
// show a snippet from the beginning of the document
j := strings.LastIndex(s[:snippetlen], " ")
if j == -1 {

56
tokenizer.go Normal file
View File

@@ -0,0 +1,56 @@
package main
import (
"bytes"
"strings"
"unicode"
"unicode/utf8"
)
// tokenize splits the text into alphanumeric tokens, dropping every
// rune that is neither a letter nor a number.
func tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	return strings.FieldsFunc(text, isSeparator)
}
// lowercaseFilter maps every token to its lower case form.
func lowercaseFilter(tokens []string) []string {
	result := make([]string, 0, len(tokens))
	for _, token := range tokens {
		result = append(result, strings.ToLower(token))
	}
	return result
}
// tokens returns a slice of alphanumeric tokens.
func tokens(text string) []string {
tokens := tokenize(text)
tokens = lowercaseFilter(tokens)
return tokens
}
// hashtags returns all hashtags found in s, in lower case. A hashtag
// is a '#' followed by at least one letter, number or underscore.
func hashtags(s []byte) []string {
	found := make([]string, 0)
	for len(s) > 0 {
		start := bytes.IndexRune(s, '#')
		if start < 0 {
			break
		}
		// Extend the tag rune by rune past the '#'.
		end := start + 1
		for end < len(s) {
			r, n := utf8.DecodeRune(s[end:])
			if n == 0 || (!unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '_') {
				break
			}
			end += n
		}
		if end > start+1 { // not just "#"
			found = append(found, string(bytes.ToLower(s[start:end])))
		}
		s = s[end:]
	}
	return found
}

13
tokenizer_test.go Normal file
View File

@@ -0,0 +1,13 @@
package main
import (
"github.com/stretchr/testify/assert"
"testing"
)
func TestTokenizer(t *testing.T) {
assert.EqualValues(t, []string{}, tokens(""), "empty string")
assert.EqualValues(t, []string{"franc"}, tokens("Franc"), "lower case")
assert.EqualValues(t, []string{"i", "don", "t", "know", "what", "to", "do"}, tokens("I don't know what to do."))
assert.EqualValues(t, []string{"#truth"}, hashtags([]byte("This is boring. #Truth")), "hashtags")
}

View File

@@ -3,8 +3,8 @@ package main
import (
"net/http"
"os"
"time"
"strings"
"time"
)
// rootHandler just redirects to /view/index.
@@ -31,7 +31,7 @@ func viewHandler(w http.ResponseWriter, r *http.Request, name string) {
file = false
if strings.HasSuffix(fn, ".rss") {
rss = true
name = fn[0:len(fn)-4]
name = fn[0 : len(fn)-4]
fn = name
}
fn += ".md"

View File

@@ -88,14 +88,13 @@ I like spring better
assert.NoError(t, err)
h := makeHandler(viewHandler, true)
assert.Equal(t, []string{fi.ModTime().UTC().Format(http.TimeFormat)},
HTTPHeaders(h, "GET", "/view/testdata/now", nil, "Last-Modified"))
HTTPHeaders(h, "GET", "/view/testdata/now", nil, "Last-Modified"))
HTTPStatusCodeIfModifiedSince(t, h, "/view/testdata/now", fi.ModTime())
t.Cleanup(func() {
_ = os.RemoveAll("testdata")
})
}
// wipes testdata
func TestPageHead(t *testing.T) {
_ = os.RemoveAll("testdata")

14
wiki.go
View File

@@ -3,9 +3,9 @@ package main
import (
"context"
"flag"
"fmt"
"github.com/google/subcommands"
"html/template"
"log"
"net/http"
"os"
"regexp"
@@ -69,12 +69,12 @@ func getPort() string {
// and after. For testing, call index.load directly and skip the
// messages.
func scheduleLoadIndex() {
fmt.Print("Indexing pages\n")
log.Print("Indexing pages")
n, err := index.load()
if err == nil {
fmt.Printf("Indexed %d pages\n", n)
log.Printf("Indexed %d pages", n)
} else {
fmt.Println("Indexing failed")
log.Printf("Indexing failed: %s", err)
}
}
@@ -82,9 +82,9 @@ func scheduleLoadIndex() {
// and after. For testing, call loadLanguages directly and skip the
// messages.
func scheduleLoadLanguages() {
fmt.Print("Loading languages\n")
log.Print("Loading languages")
n := loadLanguages()
fmt.Printf("Loaded %d languages\n", n)
log.Printf("Loaded %d languages", n)
}
func serve() {
@@ -101,7 +101,7 @@ func serve() {
go scheduleLoadLanguages()
initAccounts()
port := getPort()
fmt.Printf("Serving a wiki on port %s\n", port)
log.Printf("Serving a wiki on port %s", port)
http.ListenAndServe(":"+port, nil)
}

View File

@@ -66,6 +66,7 @@ func HTTPUploadAndRedirectTo(t *testing.T, handler http.HandlerFunc, url, conten
"Expected HTTP redirect location %s for %q but received %v", destination, url, headers)
return isRedirectCode
}
// HTTPStatusCodeIfModifiedSince checks that the request results in a
// 304 response for the given time.
func HTTPStatusCodeIfModifiedSince(t *testing.T, handler http.HandlerFunc, url string, ti time.Time) {