From f2347b8801705468acb214adfdab428f2ef2b8e3 Mon Sep 17 00:00:00 2001
From: adminoo
Date: Thu, 4 Mar 2021 17:04:17 +0100
Subject: [PATCH] regex after all heh

---
 main_test.go |   8 ++--
 tokenizer.go | 115 +++++++++++++++------------------------------------
 2 files changed, 38 insertions(+), 85 deletions(-)

diff --git a/main_test.go b/main_test.go
index 752bb4b..db0f685 100644
--- a/main_test.go
+++ b/main_test.go
@@ -26,8 +26,8 @@ func TestLinkNoTag(t *testing.T) {
 }
 
 func TestLinkTag(t *testing.T) {
-	var message string = "** https://pleroma.social/announcements/feed.xml :software:"
-	var expected string = "https://pleroma.social/announcements/feed.xml software"
+	var message string = "** https://pleroma.social/announcements/feed.xml :software: :social:"
+	var expected string = "https://pleroma.social/announcements/feed.xml software social"
 	result := LexerTestWrapper(message, expected)
 	if result != expected {
 		LexerTestWrapperFail(expected, result)
@@ -36,8 +36,8 @@ func TestLinkTag(t *testing.T) {
 }
 
 func TestLinkDescTag(t *testing.T) {
-	var message string = "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
-	var expected string = "https://pleroma.social/announcements/feed.xml # Pleroma Social"
+	var message string = "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
+	var expected string = "https://pleroma.social/announcements/feed.xml software # Pleroma Social"
 	var result string = LexerTestWrapper(message, expected)
 	if result != expected {
 		LexerTestWrapperFail(expected, result)
diff --git a/tokenizer.go b/tokenizer.go
index efa1b92..d26a5e4 100644
--- a/tokenizer.go
+++ b/tokenizer.go
@@ -2,6 +2,7 @@ package main
 
 import (
 	"fmt"
+	"regexp"
 	"strings"
 )
 
@@ -14,18 +15,12 @@ type Feed struct {
 
 // Return the final feed string, depending on either the link has a description, tags or not
 func (f Feed) String() string {
 	var ff string
-	var tags string
+	var tags string = strings.TrimSpace(strings.Join(f.Tags, " "))
 
-	if len(f.Tags) > 0 {
-		for i := range f.Tags {
-			tags += " " + f.Tags[i]
-		}
-	}
+	ff = fmt.Sprintf("%s %s", f.URL, tags)
 
-	if f.Description == "" {
-		ff = fmt.Sprintf("%s %s", f.URL, tags)
-	} else {
-		ff = fmt.Sprintf("%s %s # %s", f.URL, f.Description, tags)
+	if f.Description != "" {
+		ff = fmt.Sprintf("%s # %s", strings.TrimSpace(ff), f.Description)
 	}
 
 	return strings.TrimSpace(ff)
@@ -71,91 +66,49 @@ func (l Lexer) CountToken(t string) int {
 	return counter
 }
 
-func (l Lexer) IdentifyContent() string {
-	var tokenType string
-	if l.CountToken("[") == 2 {
-		tokenType = "URL"
+func (l *Lexer) ProcessSimpleLink(s string) {
+	ss := strings.Split(s[3:], " ")
+	for w := range ss {
+		if w == 0 {
+			url := strings.TrimSpace(ss[w])
+			l.AddToken("URL", url)
+		} else {
+			if ss[w] != "" && ss[w] != " " {
+				tag := strings.ReplaceAll(ss[w], ":", "")
+				l.AddToken("TAG", tag)
+			}
+		}
 	}
-	if l.CountToken("[") == 3 {
-		tokenType = "DESC"
-	} else {
-
-	}
-	return tokenType
 }
 
 // Tokenize a given line s from the org file
 func (l Lexer) Process(s string) []Token {
-	if strings.Count(s, "[") <= 0 {
-		ss := strings.Split(s, " ")
-		fmt.Println(ss)
-		for w := range ss {
-			if w == 0 {
-				l.AddToken("URL", ss[w])
-			} else {
-				l.AddToken("TAG", ss[w])
-			}
-		}
-
+	if s[3] != '[' {
+		l.ProcessSimpleLink(s)
 		return l.Tokens
 	}
-
-	for i := range s {
-		char := string(s[i])
-		switch char {
-		case "*":
-			l.AddToken("HEADER", char)
-		case "[":
-			l.AddToken("OBRACKET", char)
-
-		case "]":
-			// non-empty buffer and closing bracket means
-			// current state is out of "content" context and buffer can be tokenized
-			if len(l.Buf) > 0 {
-				var tokenType string = l.IdentifyContent()
-				l.AddToken("CBRACKET", char)
-				l.AddToken(tokenType, l.Buf)
-				l.Buf = ""
+	re := regexp.MustCompile(`(?:\[\[)(?P<url>\S+)(?:\]\[)(?P<desc>.+)(?:\]\])(?P<tags>.+)?`)
+	matches := re.FindStringSubmatch(s)
+	if len(matches) > 1 {
+		l.AddToken("URL", strings.TrimSpace(matches[1]))
+	}
+	if len(matches) > 2 {
+		l.AddToken("DESC", strings.TrimSpace(matches[2]))
+	}
+	if len(matches) > 3 {
+		tags := strings.Split(matches[3], " ")
+		for t := range tags {
+			if tags[t] != "" && tags[t] != " " {
+				tag := strings.ReplaceAll(tags[t], ":", "")
+				l.AddToken("TAG", strings.TrimSpace(tag))
 			}
-
-		// whitespaces have different meaning given the context :
-		// Either separator or part of a content string
-		case " ":
-			lt := l.LastToken()
-			if len(l.Buf) > 0 {
-				l.AddToken(l.IdentifyContent(), l.Buf)
-				l.Buf = ""
-				l.AddToken("WHITESPACE", char)
-			}
-
-			if i > 0 {
-				if string(s[i-1]) == " " {
-					break
-				}
-			}
-
-			if lt.Type != "WHITESPACE" {
-				if len(l.Buf) == 0 {
-					l.AddToken("WHITESPACE", char)
-				} else {
-					l.Buf += char
-				}
-			}
-
-		default:
-			l.Buf += char
 		}
 	}
-
-	if len(l.Buf) > 0 {
-		l.AddToken(l.IdentifyContent(), l.Buf)
-	}
-
+
 	return l.Tokens
 }
 
-// Only retrieve content tokens, ignores uneeded separators and brackets
 func Parse(t []Token) Feed {
 	var f Feed
 	for i := range t {