borked the shitty lexer, tests, new entities

2021-03-04 11:42:31 +01:00
parent e6c5fca61e
commit 70adeedf39
3 changed files with 235 additions and 121 deletions
--- a/main.go
+++ b/main.go
@ -4,128 +4,8 @@ import (
 	"bufio"
 	"fmt"
 	"os"
 	"strings"
 )
 // Token is one lexical unit: a category label together with the text it
 // covers.
 type Token struct {
 	Type  string // category label, e.g. "HEADER" or "CONTENT"
 	Value string // text captured for this token
 }
 // String renders the token as "Type : 'Value'".
 func (tok Token) String() string {
 	kind, text := tok.Type, tok.Value
 	return fmt.Sprintf("%s : '%s'", kind, text)
 }
 // Lexer carries the tokenizer state: Buf accumulates the characters of the
 // piece of content currently being read (url, description, tags) and Tokens
 // holds every token produced so far.
 type Lexer struct {
 	Buf    string
 	Tokens []Token
 }
 // AddToken appends a token of type t with value s to the token list.
 func (l *Lexer) AddToken(t string, s string) {
 	tok := Token{Type: t, Value: s}
 	l.Tokens = append(l.Tokens, tok)
 }
 // LastToken returns the most recently added token, or the zero Token when
 // nothing has been tokenized yet. It is used to decide what a symbol means
 // in the current context.
 func (l Lexer) LastToken() Token {
 	n := len(l.Tokens)
 	if n == 0 {
 		return Token{}
 	}
 	return l.Tokens[n-1]
 }
 // Process tokenizes one line s from the org file and returns the resulting
 // tokens.
 //
 // The line is scanned one byte at a time. "*", "[" and "]" produce HEADER,
 // OBRACKET and CBRACKET tokens, a space is either a separator or part of the
 // content depending on context, and every other byte is collected in Buf
 // until it is emitted as a CONTENT token.
 //
 // The receiver is a value, so tokens are appended to a copy of the Lexer and
 // handed back through the return value.
 func (l Lexer) Process(s string) []Token {
 	for i := range s {
 		// s[i] is a single byte, so a multi-byte UTF-8 character is seen as
 		// several separate one-byte values here.
 		// NOTE(review): looks like input is assumed to be ASCII — confirm.
 		char := string(s[i])
 		switch char {
 		case "*":
 			l.AddToken("HEADER", char)
 		case "[":
 			l.AddToken("OBRACKET", char) // the buffer is left untouched here
 		case "]":
 			// A closing bracket with a non-empty buffer ends the current
 			// content: emit the buffer, then the bracket. A closing bracket
 			// that arrives with an empty buffer produces no token at all.
 			if len(l.Buf) > 0 {
 				l.AddToken("CONTENT", l.Buf)
 				l.Buf = ""
 				l.AddToken("CBRACKET", char)
 			}
 		// A space means different things depending on the context: either a
 		// separator between tokens or part of a content string.
 		case " ":
 			// lt is captured before anything is added for this space.
 			lt := l.LastToken()
 			// Outside of a just-opened bracket a space ends the buffered
 			// content.
 			if len(l.Buf) > 0 && lt.Type != "OBRACKET" {
 				l.AddToken("CONTENT", l.Buf)
 				l.Buf = ""
 				l.AddToken("WHITESPACE", char)
 			}
 			// The second and later spaces of a run are ignored; break leaves
 			// the switch, not the loop.
 			if i > 0 {
 				if string(s[i-1]) == " " {
 					break
 				}
 			}
 			// lt still refers to the token seen before the flush above, so a
 			// flush can be followed by a second WHITESPACE token here. When
 			// the buffer is non-empty (last token was OBRACKET) the space is
 			// kept as part of the content.
 			if lt.Type != "WHITESPACE" {
 				if len(l.Buf) == 0 {
 					l.AddToken("WHITESPACE", char)
 				} else {
 					l.Buf += char
 				}
 			}
 		default:
 			l.Buf += char
 		}
 	}
 	// Whatever is still buffered at the end of the line is content too.
 	if len(l.Buf) > 0 {
 		l.AddToken("CONTENT", l.Buf)
 	}
 	return l.Tokens
 }
 // Parse keeps the values of the CONTENT tokens, in their original order, and
 // drops every separator and bracket token.
 func Parse(t []Token) []string {
 	var values []string
 	for _, tok := range t {
 		if tok.Type != "CONTENT" {
 			continue
 		}
 		values = append(values, tok.Value)
 	}
 	return values
 }
 // FormatFeed builds one newline-terminated feed line from the content
 // strings of a parsed entry.
 //
 // The layout of content decides the output:
 //   - no element:         "" (nothing to write)
 //   - [url]:              "url\n"
 //   - [url, tag]:         "url tag\n"
 //   - [url, comment, tag]: "url tag # comment\n"
 //
 // Colons are removed from the tag. Elements past the third are ignored.
 // Inputs with fewer than two elements used to panic with an index out of
 // range; they are now handled explicitly.
 func FormatFeed(content []string) string {
 	switch len(content) {
 	case 0:
 		return ""
 	case 1:
 		return fmt.Sprintf("%s\n", content[0])
 	case 2:
 		tag := strings.ReplaceAll(content[1], ":", "")
 		return fmt.Sprintf("%s %s\n", content[0], tag)
 	default:
 		// With a description present the tag moves to the third slot.
 		tag := strings.ReplaceAll(content[2], ":", "")
 		return fmt.Sprintf("%s %s # %s\n", content[0], tag, content[1])
 	}
 }
 func IsExistFile(path string) error {
 	if _, err := os.Stat(path); os.IsNotExist(err) {
 		message := fmt.Sprintf("File does not exist : %s", err)
@ -170,7 +50,7 @@ func main() {
 		}
 		tokens := lexer.Process(scanner.Text())
-		feed := FormatFeed(Parse(tokens))
+		feed := Parse(tokens).String()
 		file.WriteString(feed)
 	}
--- a/main_test.go
+++ b/main_test.go
@ -0,0 +1,57 @@
 package main
 import (
 	"fmt"
 	"testing"
 )
 // LexerTestWrapper runs message through a fresh Lexer and the parser and
 // returns the formatted result. The expected argument is not used.
 func LexerTestWrapper(message string, expected string) string {
 	var lexer Lexer
 	tokens := lexer.Process(message)
 	parsed := Parse(tokens)
 	return parsed.String()
 }
 // LexerTestWrapperFail prints the expected and the actual result to standard
 // output. It does not mark any test as failed; callers do that themselves.
 func LexerTestWrapperFail(expected string, result string) {
 	fmt.Println("Expected :", expected)
 	fmt.Println("Got : ", result)
 }
 // TestLinkNoTag checks a headline holding a bare URL with neither a
 // description nor tags.
 func TestLinkNoTag(t *testing.T) {
 	const (
 		message  = "** https://pleroma.social/announcements/feed.xml"
 		expected = "https://pleroma.social/announcements/feed.xml"
 	)
 	if result := LexerTestWrapper(message, expected); result != expected {
 		LexerTestWrapperFail(expected, result)
 		t.Fail()
 	}
 }
 // TestLinkTag checks a headline holding a bare URL followed by a tag that is
 // separated from it by a run of spaces.
 func TestLinkTag(t *testing.T) {
 	const (
 		message  = "** https://pleroma.social/announcements/feed.xml               :software:"
 		expected = "https://pleroma.social/announcements/feed.xml software"
 	)
 	if result := LexerTestWrapper(message, expected); result != expected {
 		LexerTestWrapperFail(expected, result)
 		t.Fail()
 	}
 }
 // TestLinkDescTag checks a bracketed link that carries both a description
 // and a tag. The tag has to show up in the output, between the URL and the
 // "# description" comment; the previous expectation was a copy of the
 // no-tag case and therefore never verified the tag.
 func TestLinkDescTag(t *testing.T) {
 	message := "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]] :software:"
 	expected := "https://pleroma.social/announcements/feed.xml software # Pleroma Social"
 	result := LexerTestWrapper(message, expected)
 	if result != expected {
 		LexerTestWrapperFail(expected, result)
 		t.Fail()
 	}
 }
 // TestLinkDescNoTag checks a bracketed link that has a description but no
 // tag.
 func TestLinkDescNoTag(t *testing.T) {
 	const (
 		message  = "** [[https://pleroma.social/announcements/feed.xml][Pleroma Social]]"
 		expected = "https://pleroma.social/announcements/feed.xml # Pleroma Social"
 	)
 	if result := LexerTestWrapper(message, expected); result != expected {
 		LexerTestWrapperFail(expected, result)
 		t.Fail()
 	}
 }
--- a/tokenizer.go
+++ b/tokenizer.go
@ -0,0 +1,177 @@
 package main
 import (
 	"fmt"
 	"strings"
 )
 // Feed is one entry of the generated feed list.
 type Feed struct {
 	URL         string   // address of the feed
 	Description string   // optional human-readable title
 	Tags        []string // optional tags, without surrounding colons
 }
 // String returns the feed line in the form
 //
 //	URL[ tag ...][ # Description]
 //
 // Tags follow the URL separated by single spaces and the description, when
 // present, comes last after " # ". Missing parts leave no stray separators
 // behind, and surrounding whitespace is trimmed from the result.
 func (f Feed) String() string {
 	var b strings.Builder
 	b.WriteString(f.URL)
 	for _, tag := range f.Tags {
 		b.WriteString(" ")
 		b.WriteString(tag)
 	}
 	if f.Description != "" {
 		b.WriteString(" # ")
 		b.WriteString(f.Description)
 	}
 	return strings.TrimSpace(b.String())
 }
 // Token is one lexical unit: a category label together with the text it
 // covers.
 type Token struct {
 	Type  string // category label, e.g. "URL", "DESC" or "TAG"
 	Value string // text captured for this token
 }
 // String renders the token as "Type : 'Value'".
 func (tok Token) String() string {
 	kind, text := tok.Type, tok.Value
 	return fmt.Sprintf("%s : '%s'", kind, text)
 }
 // Lexer carries the tokenizer state: Buf accumulates the characters of the
 // piece of content currently being read (url, description, tags) and Tokens
 // holds every token produced so far.
 type Lexer struct {
 	Buf    string
 	Tokens []Token
 }
 // AddToken appends a token of type t with value s to the token list.
 func (l *Lexer) AddToken(t string, s string) {
 	tok := Token{Type: t, Value: s}
 	l.Tokens = append(l.Tokens, tok)
 }
 // LastToken returns the most recently added token, or the zero Token when
 // nothing has been tokenized yet. It is used to decide what a symbol means
 // in the current context.
 func (l Lexer) LastToken() Token {
 	n := len(l.Tokens)
 	if n == 0 {
 		return Token{}
 	}
 	return l.Tokens[n-1]
 }
 // CountToken reports how many stored tokens have a Value equal to t. The
 // comparison is made against the token value, not the token type.
 func (l Lexer) CountToken(t string) int {
 	matches := 0
 	for _, tok := range l.Tokens {
 		if tok.Value == t {
 			matches++
 		}
 	}
 	return matches
 }
 // IdentifyContent names the kind of content being read from the number of
 // "[" tokens seen so far: two means "URL", three means "DESC", and any
 // other count yields the empty string.
 func (l Lexer) IdentifyContent() string {
 	switch l.CountToken("[") {
 	case 2:
 		return "URL"
 	case 3:
 		return "DESC"
 	default:
 		return ""
 	}
 }
 // Process tokenizes one headline s from the org file and returns the
 // resulting tokens.
 //
 // Two shapes of line are understood:
 //
 //	** url :tag1:tag2:
 //	** [[url][description]] :tag1:tag2:
 //
 // Without any "[" the line is split on whitespace: fields made only of "*"
 // become HEADER tokens, the first other field is the URL and every further
 // field contributes TAG tokens. With brackets the line is scanned rune by
 // rune: text inside brackets is classified by IdentifyContent (URL, then
 // DESC) and keeps its inner spaces, text after the link is split into TAG
 // tokens. Tag values never include the surrounding colons.
 //
 // The receiver is a value, so tokens are appended to a copy of the Lexer and
 // handed back through the return value; the caller's Lexer keeps its state.
 func (l Lexer) Process(s string) []Token {
 	if !strings.Contains(s, "[") {
 		haveURL := false
 		// Fields collapses runs of spaces, so no empty tag is produced.
 		for _, field := range strings.Fields(s) {
 			switch {
 			case strings.Trim(field, "*") == "":
 				// Headline marker: one HEADER token per star, as in the
 				// bracket path below.
 				for range field {
 					l.AddToken("HEADER", "*")
 				}
 			case !haveURL:
 				l.AddToken("URL", field)
 				haveURL = true
 			default:
 				for _, tag := range splitTags(field) {
 					l.AddToken("TAG", tag)
 				}
 			}
 		}
 		return l.Tokens
 	}
 	depth := 0 // number of brackets currently open
 	// flush turns the buffered text, if any, into tokens and empties Buf.
 	flush := func() {
 		text := strings.TrimSpace(l.Buf)
 		l.Buf = ""
 		if text == "" {
 			return
 		}
 		// Text that follows a closed link is the tag group.
 		if depth == 0 && l.CountToken("[") > 0 {
 			for _, tag := range splitTags(text) {
 				l.AddToken("TAG", tag)
 			}
 			return
 		}
 		l.AddToken(l.IdentifyContent(), text)
 	}
 	// Ranging over the string yields whole runes, so non-ASCII text in a
 	// description is kept intact.
 	for _, r := range s {
 		char := string(r)
 		switch char {
 		case "*":
 			// A star is a headline marker only outside of any content.
 			if depth > 0 || l.Buf != "" {
 				l.Buf += char
 			} else {
 				l.AddToken("HEADER", char)
 			}
 		case "[":
 			flush()
 			depth++
 			l.AddToken("OBRACKET", char)
 		case "]":
 			// The content must be classified before the bracket closes.
 			flush()
 			if depth > 0 {
 				depth--
 			}
 			l.AddToken("CBRACKET", char)
 		case " ":
 			// Inside brackets a space is part of the content; outside it
 			// separates tokens and runs of spaces count once.
 			if depth > 0 {
 				l.Buf += char
 			} else {
 				flush()
 				if l.LastToken().Type != "WHITESPACE" {
 					l.AddToken("WHITESPACE", char)
 				}
 			}
 		default:
 			l.Buf += char
 		}
 	}
 	flush()
 	return l.Tokens
 }
 // splitTags breaks an org tag group such as ":a:b:" into its tag names.
 func splitTags(field string) []string {
 	return strings.FieldsFunc(field, func(r rune) bool { return r == ':' })
 }
 // Parse folds a token stream into a Feed. URL and DESC tokens set the
 // matching field (a later token overwrites an earlier one), TAG tokens are
 // collected in order, and every other token type is ignored.
 func Parse(t []Token) Feed {
 	var feed Feed
 	for _, tok := range t {
 		switch tok.Type {
 		case "URL":
 			feed.URL = tok.Value
 		case "DESC":
 			feed.Description = tok.Value
 		case "TAG":
 			feed.Tags = append(feed.Tags, tok.Value)
 		}
 	}
 	return feed
 }