Files
oddmu/tokenizer.go
Alex Schroeder 541725ecd7 Query string tokens include punctuation
Drop the tokenize function since there are now dedicated tokenizers
for query string and hashtag finding.
2023-09-27 08:24:33 +02:00

96 lines
2.4 KiB
Go

package main
import (
"bytes"
"strings"
"unicode"
"unicode/utf8"
)
// lowercaseFilter returns a slice of lower case tokens.
func lowercaseFilter(tokens []string) []string {
r := make([]string, len(tokens))
for i, token := range tokens {
r[i] = strings.ToLower(token)
}
return r
}
// tokenizeWithPredicates returns a slice of tokens for the given
// text, including punctuation. Use this to begin tokenizing the query
// string.
func tokenizeOnWhitespace(q string) []string {
return strings.Fields(q)
}
// predicateFilter returns two slices of tokens: the first with
// predicates, the other without predicates. Use this for query
// string tokens.
func predicateFilter(tokens []string) ([]string, []string) {
with := make([]string, 0)
without := make([]string, 0)
for _, token := range tokens {
if strings.Contains(token, ":") {
with = append(with, token)
} else {
without = append(without, token)
}
}
return with, without
}
// predicatesAndTokens returns two slices of tokens: the first with
// predicates, the other without predicates, all of them lower case.
// Use this for query strings.
func predicatesAndTokens(q string) ([]string, []string) {
tokens := tokenizeOnWhitespace(q)
tokens = lowercaseFilter(tokens)
return predicateFilter(tokens)
}
// noPredicateFilter returns a slice of tokens: the predicates without
// the predicate, and all the others. That is: "foo:bar baz" is turned
// into ["bar", "baz"] and the predicate "foo:" is dropped.
func noPredicateFilter(tokens []string) []string {
r := make([]string, 0)
for _, token := range tokens {
parts := strings.Split(token, ":")
r = append(r, parts[len(parts)-1])
}
return r
}
// highlightTokens returns the tokens to highlight, including title
// predicates.
func highlightTokens(q string) []string {
tokens := tokenizeOnWhitespace(q)
tokens = lowercaseFilter(tokens)
return noPredicateFilter(tokens)
}
// hashtags returns a slice of hashtags. Use this to extract hashtags
// from a page body.
func hashtags(s []byte) []string {
hashtags := make([]string, 0)
for {
i := bytes.IndexRune(s, '#')
if i == -1 {
return hashtags
}
from := i
i++
for {
r, n := utf8.DecodeRune(s[i:])
if n > 0 && (unicode.IsLetter(r) || unicode.IsNumber(r) || r == '_') {
i += n
} else {
break
}
}
if i > from+1 { // not just "#"
hashtags = append(hashtags, string(bytes.ToLower(s[from:i])))
}
s = s[i:]
}
}