diff --git a/ast/ast.go b/ast/ast.go index b4ee4b0..51d5434 100644 --- a/ast/ast.go +++ b/ast/ast.go @@ -2,6 +2,7 @@ package ast import ( "bytes" + "fmt" "strings" "github.com/skx/monkey/token" @@ -561,6 +562,29 @@ func (sl *StringLiteral) TokenLiteral() string { return sl.Token.Literal } // String returns this object as a string. func (sl *StringLiteral) String() string { return sl.Token.Literal } +// RegexpLiteral holds a regular-expression. +type RegexpLiteral struct { + // Token is the token + Token token.Token + + // Value is the value of the regular expression. + Value string + + // Flags contains any flags associated with the regexp. + Flags string +} + +func (rl *RegexpLiteral) expressionNode() {} + +// TokenLiteral returns the literal token. +func (rl *RegexpLiteral) TokenLiteral() string { return rl.Token.Literal } + +// String returns this object as a string. +func (rl *RegexpLiteral) String() string { + + return (fmt.Sprintf("/%s/%s", rl.Value, rl.Flags)) +} + // BacktickLiteral holds details of a command to be executed type BacktickLiteral struct { // Token is the actual token diff --git a/evaluator/evaluator.go b/evaluator/evaluator.go index 5262cfc..216a137 100644 --- a/evaluator/evaluator.go +++ b/evaluator/evaluator.go @@ -142,6 +142,8 @@ func Eval(node ast.Node, env *object.Environment) object.Object { return &object.Array{Elements: elements} case *ast.StringLiteral: return &object.String{Value: node.Value} + case *ast.RegexpLiteral: + return &object.Regexp{Value: node.Value, Flags: node.Flags} case *ast.BacktickLiteral: return backTickOperation(node.Value) case *ast.IndexExpression: @@ -273,8 +275,14 @@ func evalInfixExpression(operator string, left, right object.Object) object.Obje return nativeBoolToBooleanObject(objectToNativeBoolean(left) && objectToNativeBoolean(right)) case operator == "||": return nativeBoolToBooleanObject(objectToNativeBoolean(left) || objectToNativeBoolean(right)) + case operator == "!~": + return notMatches(left, right) 
+ case operator == "~=": + return matches(left, right) + case operator == "==": return nativeBoolToBooleanObject(left == right) + case operator == "!=": return nativeBoolToBooleanObject(left != right) case left.Type() == object.BOOLEAN_OBJ && right.Type() == object.BOOLEAN_OBJ: @@ -288,6 +296,63 @@ func evalInfixExpression(operator string, left, right object.Object) object.Obje } } +func matches(left, right object.Object) object.Object { + + str := left.Inspect() + + if right.Type() != object.REGEXP_OBJ { + return newError("regexp required for regexp-match, given %s", right.Type()) + } + + val := right.(*object.Regexp).Value + if right.(*object.Regexp).Flags != "" { + val = "(?" + right.(*object.Regexp).Flags + ")" + val + } + + // Compile the regular expression. + r, err := regexp.Compile(val) + + // Ensure it compiled + if err != nil { + return newError("error compiling regexp '%s': %s", right.Inspect(), err) + } + + // Test if it matched + if r.MatchString(str) { + return TRUE + } + + return FALSE +} + +func notMatches(left, right object.Object) object.Object { + str := left.Inspect() + + if right.Type() != object.REGEXP_OBJ { + return newError("regexp required for regexp-match, given %s", right.Type()) + } + + val := right.(*object.Regexp).Value + if right.(*object.Regexp).Flags != "" { + val = "(?" + right.(*object.Regexp).Flags + ")" + val + } + + // Compile the regular expression. + r, err := regexp.Compile(val) + + // Ensure it compiled + if err != nil { + return newError("error compiling regexp '%s': %s", right.Inspect(), err) + } + + // Test if it matched + if r.MatchString(str) { + return FALSE + } + + return TRUE +} + // boolean operations func evalBooleanInfixExpression(operator string, left, right object.Object) object.Object { // convert the bools to strings. 
@@ -1010,6 +1075,8 @@ func objectToNativeBoolean(o object.Object) bool { return obj.Value case *object.String: return obj.Value != "" + case *object.Regexp: + return obj.Value != "" case *object.Null: return false case *object.Integer: diff --git a/evaluator/stdlib_core.go b/evaluator/stdlib_core.go index f407fa5..269101e 100644 --- a/evaluator/stdlib_core.go +++ b/evaluator/stdlib_core.go @@ -510,6 +510,8 @@ func typeFun(args ...object.Object) object.Object { switch args[0].(type) { case *object.String: return &object.String{Value: "string"} + case *object.Regexp: + return &object.String{Value: "regexp"} case *object.Boolean: return &object.String{Value: "bool"} case *object.Builtin: diff --git a/lexer/lexer.go b/lexer/lexer.go index 99afb81..f981517 100644 --- a/lexer/lexer.go +++ b/lexer/lexer.go @@ -1,17 +1,28 @@ package lexer import ( + "fmt" "strings" "github.com/skx/monkey/token" ) -// Lexer used to be as lexer for monkey programming language. +// Lexer holds our object-state. type Lexer struct { - position int //current character position - readPosition int //next character position - ch rune //current character - characters []rune //rune slice of input string + // The current character position + position int + + // The next character position + readPosition int + + //The current character + ch rune + + // A rune slice of our input string + characters []rune + + // Previous token. + prevToken token.Token } // New a Lexer instance from string input. 
@@ -140,7 +151,34 @@ func (l *Lexer) NextToken() token.Token {
 			l.readChar()
 			tok = token.Token{Type: token.SLASH_EQUALS, Literal: string(ch) + string(l.ch)}
 		} else {
-			tok = newToken(token.SLASH, l.ch)
+			// slash is mostly division, but could
+			// be the start of a regular expression
+
+			// We exclude:
+			//   a[b] / c      -> RBRACKET
+			//   ( a + b ) / c -> RPAREN
+			//   a / c         -> IDENT
+			//   3.2 / c       -> FLOAT
+			//   1 / c         -> IDENT
+			//
+			if l.prevToken.Type == token.RBRACKET ||
+				l.prevToken.Type == token.RPAREN ||
+				l.prevToken.Type == token.IDENT ||
+				l.prevToken.Type == token.INT ||
+				l.prevToken.Type == token.FLOAT {
+
+				tok = newToken(token.SLASH, l.ch)
+			} else {
+				str, err := l.readRegexp()
+				if err == nil {
+					tok.Type = token.REGEXP
+					tok.Literal = str
+				} else {
+					fmt.Printf("%s\n", err.Error())
+					tok.Type = token.REGEXP
+					tok.Literal = str
+				}
+			}
 		}
 	case rune('*'):
 		if l.peekChar() == rune('*') {
@@ -170,13 +208,27 @@
 		} else {
 			tok = newToken(token.GT, l.ch)
 		}
+	case rune('~'):
+		if l.peekChar() == rune('=') {
+			l.readChar()
+			tok = token.Token{Type: token.CONTAINS, Literal: "~="}
+		} else {
+			tok = newToken(token.ILLEGAL, l.ch)
+		}
 	case rune('!'):
 		if l.peekChar() == rune('=') {
 			ch := l.ch
 			l.readChar()
 			tok = token.Token{Type: token.NOT_EQ, Literal: string(ch) + string(l.ch)}
 		} else {
-			tok = newToken(token.BANG, l.ch)
+			if l.peekChar() == rune('~') {
+				ch := l.ch
+				l.readChar()
+				tok = token.Token{Type: token.NOT_CONTAINS, Literal: string(ch) + string(l.ch)}
+
+			} else {
+				tok = newToken(token.BANG, l.ch)
+			}
 		}
 	case rune('"'):
 		tok.Type = token.STRING
@@ -194,14 +246,21 @@
 		tok.Literal = ""
 		tok.Type = token.EOF
 	default:
+
 		if isDigit(l.ch) {
-			return l.readDecimal()
+			tok := l.readDecimal()
+			l.prevToken = tok
+			return tok
+
 		}
 		tok.Literal = l.readIdentifier()
 		tok.Type = token.LookupIdentifier(tok.Literal)
+		l.prevToken = tok
+
 		return tok
 	}
 	l.readChar()
+	l.prevToken = tok
 	return tok
 }

@@ -470,6 +529,56 @@
func (l *Lexer) readString() string { return out } +// read a regexp, including flags. +func (l *Lexer) readRegexp() (string, error) { + out := "" + + for { + l.readChar() + + if l.ch == rune(0) { + return "unterminated regular expression", fmt.Errorf("unterminated regular expression") + } + if l.ch == '/' { + + // consume the terminating "/". + l.readChar() + + // prepare to look for flags + flags := "" + + // two flags are supported: + // i -> Ignore-case + // m -> Multiline + // + for l.ch == rune('i') || l.ch == rune('m') { + + // save the char - unless it is a repeat + if !strings.Contains(flags, string(l.ch)) { + + // we're going to sort the flags + tmp := strings.Split(flags, "") + tmp = append(tmp, string(l.ch)) + flags = strings.Join(tmp, "") + + } + + // read the next + l.readChar() + } + + // convert the regexp to go-lang + if len(flags) > 0 { + out = "(?" + flags + ")" + out + } + break + } + out = out + string(l.ch) + } + + return out, nil +} + // read the end of a backtick-quoted string func (l *Lexer) readBacktick() string { position := l.position + 1 diff --git a/lexer/lexer_test.go b/lexer/lexer_test.go index cfb6a26..253c9b4 100644 --- a/lexer/lexer_test.go +++ b/lexer/lexer_test.go @@ -42,7 +42,7 @@ let add = fn(x, y){ x+y; }; let result = add(five, ten); -!-/ *5; +!- *5; 5<10>5; if(5<10){ @@ -104,7 +104,6 @@ for {token.SEMICOLON, ";"}, {token.BANG, "!"}, {token.MINUS, "-"}, - {token.SLASH, "/"}, {token.ASTERISK, "*"}, {token.INT, "5"}, {token.SEMICOLON, ";"}, @@ -492,3 +491,116 @@ func TestIntDotMethod(t *testing.T) { } } } + +// TestRegexp ensures a simple regexp can be parsed. 
+func TestRegexp(t *testing.T) { + input := `if ( f ~= /steve/i ) +if ( f ~= /steve/m ) +if ( f ~= /steve/mi ) +if ( f ~= /steve/miiiiiiiiiiiiiiiiimmmmmmmmmmmmmiiiii )` + + tests := []struct { + expectedType token.Type + expectedLiteral string + }{ + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.IDENT, "f"}, + {token.CONTAINS, "~="}, + {token.REGEXP, "(?i)steve"}, + {token.RPAREN, ")"}, + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.IDENT, "f"}, + {token.CONTAINS, "~="}, + {token.REGEXP, "(?m)steve"}, + {token.RPAREN, ")"}, + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.IDENT, "f"}, + {token.CONTAINS, "~="}, + {token.REGEXP, "(?mi)steve"}, + {token.RPAREN, ")"}, + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.IDENT, "f"}, + {token.CONTAINS, "~="}, + {token.REGEXP, "(?mi)steve"}, + {token.RPAREN, ")"}, + {token.EOF, ""}, + } + l := New(input) + for i, tt := range tests { + tok := l.NextToken() + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) + } + if tok.Literal != tt.expectedLiteral { + t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) + } + } +} + +// TestIllegalRegexp is designed to look for an unterminated/illegal regexp +func TestIllegalRegexp(t *testing.T) { + input := `if ( f ~= /steve )` + + tests := []struct { + expectedType token.Type + expectedLiteral string + }{ + {token.IF, "if"}, + {token.LPAREN, "("}, + {token.IDENT, "f"}, + {token.CONTAINS, "~="}, + {token.REGEXP, "unterminated regular expression"}, + {token.EOF, ""}, + } + l := New(input) + for i, tt := range tests { + tok := l.NextToken() + if tok.Type != tt.expectedType { + t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type) + } + if tok.Literal != tt.expectedLiteral { + t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal) + } + } +} + +// TestDiv is designed to test that a 
division is recognized; that it is
+// not confused with a regular-expression.
+func TestDiv(t *testing.T) {
+	input := `a = b / c;
+a = 3/4;
+`
+
+	tests := []struct {
+		expectedType    token.Type
+		expectedLiteral string
+	}{
+		{token.IDENT, "a"},
+		{token.ASSIGN, "="},
+		{token.IDENT, "b"},
+		{token.SLASH, "/"},
+		{token.IDENT, "c"},
+		{token.SEMICOLON, ";"},
+		{token.IDENT, "a"},
+		{token.ASSIGN, "="},
+		{token.INT, "3"},
+		{token.SLASH, "/"},
+		{token.INT, "4"},
+		{token.SEMICOLON, ";"},
+		{token.EOF, ""},
+	}
+	l := New(input)
+	for i, tt := range tests {
+		tok := l.NextToken()
+		if tok.Type != tt.expectedType {
+			t.Fatalf("tests[%d] - tokentype wrong, expected=%q, got=%q", i, tt.expectedType, tok.Type)
+		}
+		if tok.Literal != tt.expectedLiteral {
+			t.Fatalf("tests[%d] - Literal wrong, expected=%q, got=%q", i, tt.expectedLiteral, tok.Literal)
+		}
+	}
+}
diff --git a/object/object.go b/object/object.go
index f2b8da7..e8ee107 100644
--- a/object/object.go
+++ b/object/object.go
@@ -18,6 +18,7 @@ const (
 	ARRAY_OBJ  = "ARRAY"
 	HASH_OBJ   = "HASH"
 	FILE_OBJ   = "FILE"
+	REGEXP_OBJ = "REGEXP"
 )

 // Object is the interface that all of our various object-types must implmenet.
diff --git a/object/object_regexp.go b/object/object_regexp.go
new file mode 100644
index 0000000..8dca87b
--- /dev/null
+++ b/object/object_regexp.go
@@ -0,0 +1,28 @@
+// The implementation of our regular-expression object.
+
+package object
+
+// Regexp wraps regular-expressions and implements the Object interface.
+type Regexp struct {
+	// Value holds the string value this object wraps.
+	Value string
+
+	// Flags holds the flags for the object
+	Flags string
+}
+
+// Type returns the type of this object.
+func (r *Regexp) Type() Type {
+	return REGEXP_OBJ
+}
+
+// Inspect returns a string-representation of the given object.
+func (r *Regexp) Inspect() string {
+	return r.Value
+}
+
+// InvokeMethod invokes a method against the object.
+// (Built-in methods only.)
+func (r *Regexp) InvokeMethod(method string, env Environment, args ...Object) Object {
+	return nil
+}
diff --git a/parser/parser.go b/parser/parser.go
index 47622ee..7ccc421 100644
--- a/parser/parser.go
+++ b/parser/parser.go
@@ -23,28 +23,32 @@ type (
 const (
 	_ int = iota
 	LOWEST
-	COND        // OR or AND
-	ASSIGN      // =
-	EQUALS      // == or !=
-	LESSGREATER // > or <
-	SUM         // + or -
-	PRODUCT     // * or /
-	POWER       // **
-	MOD         // %
-	PREFIX      // -X or !X
-	CALL        // myFunction(X)
-	INDEX       // array[index], map[key]
+	COND         // OR or AND
+	ASSIGN       // =
+	EQUALS       // == or !=
+	REGEXP_MATCH // !~ ~=
+	LESSGREATER  // > or <
+	SUM          // + or -
+	PRODUCT      // * or /
+	POWER        // **
+	MOD          // %
+	PREFIX       // -X or !X
+	CALL         // myFunction(X)
+	INDEX        // array[index], map[key]
 )

 // each token precedence
 var precedences = map[token.Type]int{
-	token.ASSIGN:    ASSIGN,
-	token.EQ:        EQUALS,
-	token.NOT_EQ:    EQUALS,
-	token.LT:        LESSGREATER,
-	token.LT_EQUALS: LESSGREATER,
-	token.GT:        LESSGREATER,
-	token.GT_EQUALS: LESSGREATER,
+	token.ASSIGN:       ASSIGN,
+	token.EQ:           EQUALS,
+	token.NOT_EQ:       EQUALS,
+	token.LT:           LESSGREATER,
+	token.LT_EQUALS:    LESSGREATER,
+	token.GT:           LESSGREATER,
+	token.GT_EQUALS:    LESSGREATER,
+	token.CONTAINS:     REGEXP_MATCH,
+	token.NOT_CONTAINS: REGEXP_MATCH,
+
 	token.PLUS:        SUM,
 	token.PLUS_EQUALS: SUM,
 	token.MINUS:       SUM,
@@ -102,6 +106,7 @@ func New(l *lexer.Lexer) *Parser {
 	p.prefixParseFns = make(map[token.Type]prefixParseFn)
 	p.registerPrefix(token.IDENT, p.parseIdentifier)
 	p.registerPrefix(token.INT, p.parseIntegerLiteral)
+	p.registerPrefix(token.REGEXP, p.parseRegexpLiteral)
 	p.registerPrefix(token.FLOAT, p.parseFloatLiteral)
 	p.registerPrefix(token.TRUE, p.parseBoolean)
 	p.registerPrefix(token.FALSE, p.parseBoolean)
@@ -116,6 +121,7 @@ func New(l *lexer.Lexer) *Parser {
 	p.registerPrefix(token.BACKTICK, p.parseBacktickLiteral)
 	p.registerPrefix(token.LBRACKET, p.parseArrayLiteral)
 	p.registerPrefix(token.LBRACE, p.parseHashLiteral)
+
 	p.infixParseFns =
make(map[token.Type]infixParseFn) p.registerInfix(token.ASSIGN, p.parseAssignExpression) @@ -140,6 +146,8 @@ func New(l *lexer.Lexer) *Parser { p.registerInfix(token.MINUS_EQUALS, p.parseAssignExpression) p.registerInfix(token.ASTERISK_EQUALS, p.parseAssignExpression) p.registerInfix(token.SLASH_EQUALS, p.parseAssignExpression) + p.registerInfix(token.CONTAINS, p.parseInfixExpression) + p.registerInfix(token.NOT_CONTAINS, p.parseInfixExpression) p.postfixParseFns = make(map[token.Type]postfixParseFn) p.registerPostfix(token.PLUS_PLUS, p.parsePostfixExpression) @@ -519,6 +527,32 @@ func (p *Parser) parseStringLiteral() ast.Expression { return &ast.StringLiteral{Token: p.curToken, Value: p.curToken.Literal} } +// parseRegexpLiteral parses a regular-expression. +func (p *Parser) parseRegexpLiteral() ast.Expression { + + flags := "" + + val := p.curToken.Literal + if strings.HasPrefix(val, "(?") { + val = strings.TrimPrefix(val, "(?") + + i := 0 + for i < len(val) { + + if val[i] == ')' { + + val = val[i+1:] + break + } else { + flags += string(val[i]) + } + + i++ + } + } + return &ast.RegexpLiteral{Token: p.curToken, Value: val, Flags: flags} +} + // parseBacktickLiteral parses a backtick-expression. func (p *Parser) parseBacktickLiteral() ast.Expression { return &ast.BacktickLiteral{Token: p.curToken, Value: p.curToken.Literal} diff --git a/token/token.go b/token/token.go index 21180f2..c2231a8 100644 --- a/token/token.go +++ b/token/token.go @@ -55,10 +55,14 @@ const ( EQ = "==" NOT_EQ = "!=" STRING = "STRING" + REGEXP = "REGEXP" LBRACKET = "[" RBRACKET = "]" COLON = ":" PERIOD = "." + CONTAINS = "~=" + NOT_CONTAINS = "!~" + ILLEGAL = "ILLEGAL" ) // reversed keywords