/*
Copyright 2012 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

/*
Package shlex implements a simple lexer which splits input into tokens using
shell-style rules for quoting and commenting.

The basic use case uses the default ASCII lexer to split a string into sub-strings:

	shlex.Split("one \"two three\" four") -> []string{"one", "two three", "four"}

To process a stream of strings:

	l := NewLexer(os.Stdin)
	for {
		token, err := l.Next()
		if err != nil {
			break // io.EOF signals end of input
		}
		// process token
	}

To access the raw token stream (which includes tokens for comments):

	t := NewTokenizer(os.Stdin)
	for {
		token, err := t.Next()
		if err != nil {
			break // io.EOF signals end of input
		}
		// process token
	}
*/
package shlex

import (
	"bufio"
	"fmt"
	"io"
	"strings"
)

// TokenType is a top-level token classification: a word, space, comment, or unknown.
type TokenType int

// runeTokenClass is the type of a UTF-8 character classification: a quote, space, escape.
type runeTokenClass int

// lexerState is the internal state used by the lexer state machine.
type lexerState int

// Token is a (type, value) pair representing a lexical token.
type Token struct {
	tokenType TokenType
	value     string
}

// Equal reports whether tokens a and b are equal.
// Two tokens are equal if both their types and values are equal. A nil token
// can never be equal to another token.
func (a *Token) Equal(b *Token) bool {
	if a == nil || b == nil {
		return false
	}
	if a.tokenType != b.tokenType {
		return false
	}
	return a.value == b.value
}

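// equalExample is an illustrative sketch (a hypothetical helper, not part of
// the original API): it demonstrates that Equal compares both type and value,
// and that a nil *Token never equals anything.
func equalExample() bool {
	a := &Token{tokenType: WordToken, value: "x"}
	b := &Token{tokenType: WordToken, value: "x"}
	var c *Token
	return a.Equal(b) && !a.Equal(c) // a equals b; nothing equals nil
}
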
// Named classes of UTF-8 runes
const (
	spaceRunes            = " \t\r\n"
	escapingQuoteRunes    = `"`
	nonEscapingQuoteRunes = "'"
	escapeRunes           = `\`
	commentRunes          = "#"
)

// Classes of rune tokens
const (
	unknownRuneClass runeTokenClass = iota
	spaceRuneClass
	escapingQuoteRuneClass
	nonEscapingQuoteRuneClass
	escapeRuneClass
	commentRuneClass
	eofRuneClass
)

// Classes of lexical tokens
const (
	UnknownToken TokenType = iota
	WordToken
	SpaceToken
	CommentToken
)

// Lexer state machine states
const (
	startState           lexerState = iota // no runes have been seen
	inWordState                            // processing regular runes in a word
	escapingState                          // we have just consumed an escape rune; the next rune is literal
	escapingQuotedState                    // we have just consumed an escape rune within a quoted string
	quotingEscapingState                   // we are within a quoted string that supports escaping ("...")
	quotingState                           // we are within a string that does not support escaping ('...')
	commentState                           // we are within a comment (everything following an unquoted or unescaped #)
)

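// As a worked illustration of the states above (not part of the original
// source), lexing the input a"b\"c" proceeds:
//
//	'a'  startState           -> inWordState          (value: a)
//	'"'  inWordState          -> quotingEscapingState
//	'b'  quotingEscapingState -> quotingEscapingState (value: ab)
//	'\'  quotingEscapingState -> escapingQuotedState
//	'"'  escapingQuotedState  -> quotingEscapingState (value: ab")
//	'c'  quotingEscapingState -> quotingEscapingState (value: ab"c)
//	'"'  quotingEscapingState -> inWordState
//	EOF  returns the word token `ab"c`
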
// tokenClassifier is used for classifying rune characters.
type tokenClassifier map[rune]runeTokenClass

// addRuneClass registers every rune in the given string as belonging to the
// given rune class.
func (typeMap tokenClassifier) addRuneClass(runes string, tokenType runeTokenClass) {
	for _, runeChar := range runes {
		typeMap[runeChar] = tokenType
	}
}

// newDefaultClassifier creates a new classifier for ASCII characters.
func newDefaultClassifier() tokenClassifier {
	t := tokenClassifier{}
	t.addRuneClass(spaceRunes, spaceRuneClass)
	t.addRuneClass(escapingQuoteRunes, escapingQuoteRuneClass)
	t.addRuneClass(nonEscapingQuoteRunes, nonEscapingQuoteRuneClass)
	t.addRuneClass(escapeRunes, escapeRuneClass)
	t.addRuneClass(commentRunes, commentRuneClass)
	return t
}

// ClassifyRune classifies a rune. Runes not present in the map fall back to
// the zero value, unknownRuneClass.
func (t tokenClassifier) ClassifyRune(runeVal rune) runeTokenClass {
	return t[runeVal]
}

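// newSemicolonClassifier is an illustrative sketch (a hypothetical helper,
// not part of the original API): it shows how addRuneClass can extend the
// default classifier, here treating ';' as a comment starter alongside '#'.
func newSemicolonClassifier() tokenClassifier {
	t := newDefaultClassifier()
	t.addRuneClass(";", commentRuneClass)
	return t
}
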
// Lexer turns an input stream into a sequence of tokens. Whitespace and comments are skipped.
type Lexer Tokenizer

// NewLexer creates a new lexer from an input stream.
func NewLexer(r io.Reader) *Lexer {
	return (*Lexer)(NewTokenizer(r))
}

// Next returns the next word, or an error. If there are no more words,
// the error will be io.EOF.
func (l *Lexer) Next() (string, error) {
	for {
		token, err := (*Tokenizer)(l).Next()
		if err != nil {
			return "", err
		}
		switch token.tokenType {
		case WordToken:
			return token.value, nil
		case CommentToken:
			// skip comments
		default:
			return "", fmt.Errorf("unknown token type: %v", token.tokenType)
		}
	}
}

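// printWords is a minimal usage sketch for Lexer (a hypothetical helper, not
// part of the original API): it reads whitespace-delimited words from r and
// prints each one, stopping cleanly at io.EOF.
func printWords(r io.Reader) error {
	l := NewLexer(r)
	for {
		word, err := l.Next()
		if err == io.EOF {
			return nil // end of input
		}
		if err != nil {
			return err
		}
		fmt.Println(word)
	}
}
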
// Tokenizer turns an input stream into a sequence of typed tokens.
type Tokenizer struct {
	input      bufio.Reader
	classifier tokenClassifier
}

// NewTokenizer creates a new tokenizer from an input stream.
func NewTokenizer(r io.Reader) *Tokenizer {
	input := bufio.NewReader(r)
	classifier := newDefaultClassifier()
	return &Tokenizer{
		input:      *input,
		classifier: classifier}
}

// scanStream scans the stream for the next token using the internal state machine.
// It returns an error if it reaches an unexpected internal state.
func (t *Tokenizer) scanStream() (*Token, error) {
	state := startState
	var tokenType TokenType
	var value []rune
	var nextRune rune
	var nextRuneType runeTokenClass
	var err error
	for {
		nextRune, _, err = t.input.ReadRune()
		nextRuneType = t.classifier.ClassifyRune(nextRune)

		if err == io.EOF {
			nextRuneType = eofRuneClass
			err = nil
		} else if err != nil {
			return nil, err
		}

		switch state {
		case startState: // no runes read yet
			switch nextRuneType {
			case eofRuneClass:
				return nil, io.EOF
			case spaceRuneClass:
				// skip leading whitespace
			case escapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				tokenType = WordToken
				state = quotingState
			case escapeRuneClass:
				tokenType = WordToken
				state = escapingState
			case commentRuneClass:
				tokenType = CommentToken
				state = commentState
			default:
				tokenType = WordToken
				value = append(value, nextRune)
				state = inWordState
			}
		case inWordState: // in a regular word
			switch nextRuneType {
			case eofRuneClass, spaceRuneClass:
				// the word is complete
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			case escapingQuoteRuneClass:
				state = quotingEscapingState
			case nonEscapingQuoteRuneClass:
				state = quotingState
			case escapeRuneClass:
				state = escapingState
			default:
				value = append(value, nextRune)
			}
		case escapingState: // the rune after an escape character
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			default:
				// the escaped rune is taken literally
				state = inWordState
				value = append(value, nextRune)
			}
		case escapingQuotedState: // the rune after an escape character, within double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found after escape character")
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			default:
				state = quotingEscapingState
				value = append(value, nextRune)
			}
		case quotingEscapingState: // within escaping double quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			case escapingQuoteRuneClass:
				state = inWordState
			case escapeRuneClass:
				state = escapingQuotedState
			default:
				value = append(value, nextRune)
			}
		case quotingState: // within non-escaping single quotes
			switch nextRuneType {
			case eofRuneClass:
				err = fmt.Errorf("EOF found when expecting closing quote")
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			case nonEscapingQuoteRuneClass:
				state = inWordState
			default:
				value = append(value, nextRune)
			}
		case commentState: // within a comment
			switch nextRuneType {
			case eofRuneClass:
				return &Token{
					tokenType: tokenType,
					value:     string(value)}, err
			case spaceRuneClass:
				if nextRune == '\n' {
					// a newline ends the comment
					state = startState
					return &Token{
						tokenType: tokenType,
						value:     string(value)}, err
				}
				value = append(value, nextRune)
			default:
				value = append(value, nextRune)
			}
		default:
			return nil, fmt.Errorf("unexpected state: %v", state)
		}
	}
}

// Next returns the next token in the stream.
func (t *Tokenizer) Next() (*Token, error) {
	return t.scanStream()
}

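// dumpTokens is an illustrative sketch (a hypothetical helper, not part of
// the original API): it drains a Tokenizer and prints each token's type and
// value, including the comment tokens that Lexer would skip.
func dumpTokens(r io.Reader) error {
	t := NewTokenizer(r)
	for {
		token, err := t.Next()
		if err == io.EOF {
			return nil // end of input
		}
		if err != nil {
			return err
		}
		fmt.Printf("%v: %q\n", token.tokenType, token.value)
	}
}
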
// Split partitions a string into a slice of strings.
func Split(s string) ([]string, error) {
	l := NewLexer(strings.NewReader(s))
	subStrings := make([]string, 0)
	for {
		word, err := l.Next()
		if err != nil {
			if err == io.EOF {
				return subStrings, nil
			}
			return subStrings, err
		}
		subStrings = append(subStrings, word)
	}
}
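
// splitExample is an illustrative sketch (a hypothetical helper, not part of
// the original API) showing Split on input that mixes both quote styles.
func splitExample() {
	words, _ := Split(`one "two three" 'four five'`)
	fmt.Println(words) // [one two three four five]
}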