1
0

borked the shitty lexer, tests, new entities

This commit is contained in:
2021-03-04 11:42:31 +01:00
parent e6c5fca61e
commit 70adeedf39
3 changed files with 235 additions and 121 deletions

122
main.go
View File

@ -4,128 +4,8 @@ import (
"bufio"
"fmt"
"os"
"strings"
)
// Token is a single lexed item: a type tag plus the raw text it covers.
type Token struct {
	Type  string
	Value string
}

// String renders the token as `TYPE : 'value'` for debug output.
func (t Token) String() string {
	out := fmt.Sprintf("%s : '%s'", t.Type, t.Value)
	return out
}

// Lexer embeds a temporary buffer to store "content"
// (url, description, tags) and an array of tokens.
type Lexer struct {
	Buf    string
	Tokens []Token
}

// AddToken appends a token of type t carrying value s.
func (l *Lexer) AddToken(t string, s string) {
	l.Tokens = append(l.Tokens, Token{Type: t, Value: s})
}

// LastToken returns the most recently emitted token, or the zero Token
// when nothing has been tokenized yet; useful to determine context for
// a symbol.
func (l Lexer) LastToken() Token {
	if n := len(l.Tokens); n > 0 {
		return l.Tokens[n-1]
	}
	return Token{}
}
// Process tokenizes a single line s from the org file.
//
// It walks the line byte by byte: "*" emits a HEADER token, "[" an
// OBRACKET, and any other character accumulates in l.Buf until a "]"
// or a separating space flushes the buffer as a CONTENT token.
// Leftover buffer content at end of line is flushed as a final CONTENT.
func (l Lexer) Process(s string) []Token {
	for i := range s {
		char := string(s[i])
		switch char {
		case "*":
			l.AddToken("HEADER", char)
		case "[":
			// NOTE(review): unlike "]", an opening bracket does not flush
			// any buffered content first — confirm this is intended.
			l.AddToken("OBRACKET", char)
		case "]":
			// non-empty buffer and closing bracket means current state is out of "content" context and buffer can be tokenized
			// NOTE(review): when the buffer is empty (e.g. the second "]"
			// of "]]"), no CBRACKET token is emitted at all.
			if len(l.Buf) > 0 {
				l.AddToken("CONTENT", l.Buf)
				l.Buf = ""
				l.AddToken("CBRACKET", char)
			}
		// whitespaces have different meaning given the context : Either separator or part of a content string
		case " ":
			lt := l.LastToken()
			// outside bracket context, a space terminates buffered content
			if len(l.Buf) > 0 && lt.Type != "OBRACKET" {
				l.AddToken("CONTENT", l.Buf)
				l.Buf = ""
				l.AddToken("WHITESPACE", char)
			}
			// collapse runs of consecutive spaces
			if i > 0 {
				if string(s[i-1]) == " " {
					break
				}
			}
			// NOTE(review): lt was captured before the flush above, so when
			// a flush happened this can append a second WHITESPACE in a row.
			if lt.Type != "WHITESPACE" {
				if len(l.Buf) == 0 {
					l.AddToken("WHITESPACE", char)
				} else {
					// inside brackets the space is part of the content
					l.Buf += char
				}
			}
		default:
			l.Buf += char
		}
	}
	// flush trailing content (e.g. tags at the end of the line)
	if len(l.Buf) > 0 {
		l.AddToken("CONTENT", l.Buf)
	}
	return l.Tokens
}
// Parse keeps only the CONTENT tokens from t, ignoring separator and
// bracket tokens, and returns their values in order of appearance.
func Parse(t []Token) []string {
	var content []string
	for _, tok := range t {
		if tok.Type == "CONTENT" {
			content = append(content, tok.Value)
		}
	}
	return content
}
// FormatFeed returns the final feed string, depending on whether the link
// has a description, tags or not. The content slice is expected in the
// order produced by Parse: [0] URL, [1] description, [2] ":tag:" string
// (colons are stripped from the tag).
//
// Fix: the original indexed content[0]/content[1] unconditionally, which
// panicked on empty or single-element input; the len > 1 assignment was
// also dead code, always overwritten by the following if/else.
func FormatFeed(content []string) string {
	if len(content) == 0 {
		return ""
	}
	url := content[0]
	switch {
	case len(content) > 2:
		// URL + description + tag: "url tag # description"
		tag := strings.ReplaceAll(content[2], ":", "")
		comment := content[1]
		return fmt.Sprintf("%s %s # %s\n", url, tag, comment)
	case len(content) == 2:
		// URL + tag only
		tag := strings.ReplaceAll(content[1], ":", "")
		return fmt.Sprintf("%s %s\n", url, tag)
	default:
		// bare URL
		return fmt.Sprintf("%s\n", url)
	}
}
func IsExistFile(path string) error {
if _, err := os.Stat(path); os.IsNotExist(err) {
message := fmt.Sprintf("File does not exist : %s", err)
@ -170,7 +50,7 @@ func main() {
}
tokens := lexer.Process(scanner.Text())
feed := FormatFeed(Parse(tokens))
feed := Parse(tokens).String()
file.WriteString(feed)
}

57
main_test.go Normal file
View File

@ -0,0 +1,57 @@
package main
import (
"fmt"
"testing"
)
// LexerTestWrapper runs message through a fresh Lexer and returns the
// rendered feed string. NOTE(review): the expected parameter is unused
// here; it is kept so existing call sites keep compiling.
func LexerTestWrapper(message string, expected string) string {
	var l Lexer
	return Parse(l.Process(message)).String()
}
// LexerTestWrapperFail prints the expected and actual feed strings when
// a lexer test fails, so the mismatch is visible in the test output.
func LexerTestWrapperFail(expected string, result string) {
	fmt.Println("Expected :", expected)
	fmt.Println("Got : ", result)
}
// TestLinkNoTag: a bare header line should yield just the URL.
func TestLinkNoTag(t *testing.T) {
	message := "** https://pleroma.social/announcements/feed.xml"
	expected := "https://pleroma.social/announcements/feed.xml"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkTag: a header line with an org tag should yield "url tag"
// with the surrounding colons stripped from the tag.
func TestLinkTag(t *testing.T) {
	message := "** https://pleroma.social/announcements/feed.xml :software:"
	expected := "https://pleroma.social/announcements/feed.xml software"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkDescTag: a bracketed link with a description and a tag.
// NOTE(review): the input carries a :software: tag that the expected
// string omits — confirm whether tags on description lines are meant to
// be dropped, or whether this expectation is unfinished.
func TestLinkDescTag(t *testing.T) {
	message := "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
	expected := "https://pleroma.social/announcements/feed.xml # Pleroma Social"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}
// TestLinkDescNoTag: a bracketed link with a description but no tag
// should render as "url # description".
func TestLinkDescNoTag(t *testing.T) {
	message := "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]]"
	expected := "https://pleroma.social/announcements/feed.xml # Pleroma Social"
	if result := LexerTestWrapper(message, expected); result != expected {
		LexerTestWrapperFail(expected, result)
		t.Fail()
	}
}

177
tokenizer.go Normal file
View File

@ -0,0 +1,177 @@
package main
import (
"fmt"
"strings"
)
// Feed is a single parsed org entry: a feed URL with an optional
// description and an optional list of tags.
type Feed struct {
	URL         string
	Description string
	Tags        []string
}

// String returns the final feed string, depending on whether the link
// has a description, tags or not. The layout is "URL tags # description",
// matching the old FormatFeed helper and the expectations in main_test.go.
//
// Fixes: the original built the tag string with a leading space and then
// added another via Sprintf (yielding a double space, so "url  tag" never
// matched the tests), and it placed the description before the "#" slot
// and the tags after it — swapped relative to the rest of the code base.
func (f Feed) String() string {
	parts := []string{f.URL}
	if len(f.Tags) > 0 {
		parts = append(parts, strings.Join(f.Tags, " "))
	}
	if f.Description != "" {
		parts = append(parts, "#", f.Description)
	}
	return strings.TrimSpace(strings.Join(parts, " "))
}
// Token is a single lexed item: a type tag plus the raw text it covers.
type Token struct {
	Type  string
	Value string
}

// String renders the token as `TYPE : 'value'` for debug output.
func (t Token) String() string {
	return fmt.Sprintf("%s : '%s'", t.Type, t.Value)
}

// Lexer embeds a temporary buffer to store "content"
// (url, description, tags) and an array of tokens.
type Lexer struct {
	Buf    string
	Tokens []Token
}

// AddToken appends a token of type t carrying value s.
func (l *Lexer) AddToken(t string, s string) {
	l.Tokens = append(l.Tokens, Token{t, s})
}

// LastToken returns the last tokenized item (zero Token when nothing has
// been emitted yet), useful to determine context for a symbol.
func (l Lexer) LastToken() Token {
	var lastToken Token
	if len(l.Tokens) > 0 {
		lastToken = l.Tokens[len(l.Tokens)-1]
	}
	return lastToken
}

// CountToken returns how many emitted tokens carry the literal value t,
// e.g. CountToken("[") counts the opening brackets seen so far.
func (l Lexer) CountToken(t string) int {
	var counter int
	for i := range l.Tokens {
		if l.Tokens[i].Value == t {
			counter++ // fix: was "counter ++" (not gofmt-clean)
		}
	}
	return counter
}

// IdentifyContent infers the type of the buffered content from the
// brackets emitted so far on the line:
//   - inside the first "[[" pair          -> URL
//   - inside the second "][" bracket pair -> DESC
//   - after both pairs are closed         -> TAG (fix: the original had
//     an empty else branch and never returned TAG, so trailing tags were
//     emitted with an empty type or misfiled as DESC)
func (l Lexer) IdentifyContent() string {
	open := l.CountToken("[")
	closed := l.CountToken("]")
	switch {
	case open == 2:
		return "URL"
	case open >= 3 && closed < 2:
		return "DESC"
	case open >= 3:
		return "TAG"
	default:
		return ""
	}
}

// Process tokenizes a single org line s.
//
// Plain lines ("** url :tag:") are split on whitespace: words made only
// of stars become HEADER tokens, the first real word is the URL and any
// following words are TAGs (surrounding colons stripped). Bracketed
// lines ("** [[url][description]] :tag:") are scanned character by
// character, buffering content until a bracket or separator flushes it.
//
// Fixes vs. the original: the debug fmt.Println(ss) is removed; the
// header stars are no longer tokenized as the URL; tag colons are
// stripped; the "inside brackets" guard on spaces is restored so
// multi-word descriptions stay in one DESC token; the duplicate
// WHITESPACE emission after a flush is gone.
func (l Lexer) Process(s string) []Token {
	if !strings.Contains(s, "[") {
		seenURL := false
		for _, w := range strings.Fields(s) {
			switch {
			case strings.Trim(w, "*") == "":
				// org header stars, e.g. "**"
				l.AddToken("HEADER", w)
			case !seenURL:
				l.AddToken("URL", w)
				seenURL = true
			default:
				l.AddToken("TAG", strings.Trim(w, ":"))
			}
		}
		return l.Tokens
	}
	for i := range s {
		char := string(s[i])
		switch char {
		case "*":
			l.AddToken("HEADER", char)
		case "[":
			l.AddToken("OBRACKET", char)
		case "]":
			// non-empty buffer and closing bracket means the current state
			// is out of "content" context and the buffer can be tokenized
			if len(l.Buf) > 0 {
				tokenType := l.IdentifyContent()
				l.AddToken("CBRACKET", char)
				l.AddToken(tokenType, l.Buf)
				l.Buf = ""
			}
		case " ":
			// whitespace is a separator outside brackets but part of the
			// content (multi-word descriptions) inside them
			lt := l.LastToken()
			if len(l.Buf) > 0 && lt.Type != "OBRACKET" {
				tokenType := l.IdentifyContent()
				value := l.Buf
				if tokenType == "TAG" {
					value = strings.Trim(value, ":")
				}
				l.AddToken(tokenType, value)
				l.Buf = ""
				l.AddToken("WHITESPACE", char)
			}
			// collapse runs of consecutive spaces
			if i > 0 && string(s[i-1]) == " " {
				break
			}
			// re-read the last token here: a flush above already emitted
			// a WHITESPACE, and lt would be stale
			if l.LastToken().Type != "WHITESPACE" {
				if len(l.Buf) == 0 {
					l.AddToken("WHITESPACE", char)
				} else {
					l.Buf += char
				}
			}
		default:
			l.Buf += char
		}
	}
	// flush trailing content (typically the ":tag:" suffix)
	if len(l.Buf) > 0 {
		tokenType := l.IdentifyContent()
		value := l.Buf
		if tokenType == "TAG" {
			value = strings.Trim(value, ":")
		}
		l.AddToken(tokenType, value)
	}
	return l.Tokens
}
// Parse folds a token stream into a Feed, keeping only the content
// tokens (URL, DESC, TAG) and ignoring separators and brackets.
func Parse(t []Token) Feed {
	var f Feed
	for _, tok := range t {
		switch tok.Type {
		case "URL":
			f.URL = tok.Value
		case "DESC":
			f.Description = tok.Value
		case "TAG":
			f.Tags = append(f.Tags, tok.Value)
		}
	}
	return f
}