// Forked from mirror/oddmu.
// Drop the tokenize function since there are now dedicated tokenizers
// for the query string and for hashtag finding.
package main
|
|
|
|
import (
|
|
"bytes"
|
|
"strings"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// lowercaseFilter returns a slice of lower case tokens.
func lowercaseFilter(tokens []string) []string {
	lowered := make([]string, 0, len(tokens))
	for _, token := range tokens {
		lowered = append(lowered, strings.ToLower(token))
	}
	return lowered
}
|
// tokenizeOnWhitespace returns a slice of whitespace-separated tokens
// for the given text, including punctuation. Use this to begin
// tokenizing the query string.
func tokenizeOnWhitespace(q string) []string {
	return strings.Fields(q)
}
|
// predicateFilter returns two slices of tokens: the first with
// predicates, the other without predicates. Use this for query
// string tokens.
func predicateFilter(tokens []string) ([]string, []string) {
	predicates := make([]string, 0)
	plain := make([]string, 0)
	for _, token := range tokens {
		// A ":" anywhere in the token marks it as a predicate.
		if strings.ContainsRune(token, ':') {
			predicates = append(predicates, token)
			continue
		}
		plain = append(plain, token)
	}
	return predicates, plain
}
|
// predicatesAndTokens returns two slices of tokens: the first with
|
|
// predicates, the other without predicates, all of them lower case.
|
|
// Use this for query strings.
|
|
func predicatesAndTokens(q string) ([]string, []string) {
|
|
tokens := tokenizeOnWhitespace(q)
|
|
tokens = lowercaseFilter(tokens)
|
|
return predicateFilter(tokens)
|
|
}
|
|
|
|
// noPredicateFilter returns a slice of tokens: the predicates without
// the predicate, and all the others. That is: "foo:bar baz" is turned
// into ["bar", "baz"] and the predicate "foo:" is dropped.
func noPredicateFilter(tokens []string) []string {
	r := make([]string, 0, len(tokens))
	for _, token := range tokens {
		// Keep only the text after the last ":". LastIndexByte
		// returns -1 when there is no ":", so +1 yields the whole
		// token. This is equivalent to splitting on ":" and taking
		// the last part, without the per-token slice allocation.
		r = append(r, token[strings.LastIndexByte(token, ':')+1:])
	}
	return r
}
|
// highlightTokens returns the tokens to highlight, including title
|
|
// predicates.
|
|
func highlightTokens(q string) []string {
|
|
tokens := tokenizeOnWhitespace(q)
|
|
tokens = lowercaseFilter(tokens)
|
|
return noPredicateFilter(tokens)
|
|
}
|
|
|
|
// hashtags returns a slice of hashtags. Use this to extract hashtags
// from a page body.
func hashtags(s []byte) []string {
	found := make([]string, 0)
	for {
		// '#' is ASCII, so a byte search finds the next candidate.
		start := bytes.IndexByte(s, '#')
		if start == -1 {
			return found
		}
		// Extend past the '#' over letters, digits and underscores.
		end := start + 1
		for end < len(s) {
			r, size := utf8.DecodeRune(s[end:])
			if !unicode.IsLetter(r) && !unicode.IsNumber(r) && r != '_' {
				break
			}
			end += size
		}
		// A lone "#" with no word characters after it is skipped.
		if end > start+1 {
			found = append(found, string(bytes.ToLower(s[start:end])))
		}
		s = s[end:]
	}
}