浏览代码

add yanshi

Monster 1 年之前
父节点
当前提交
14716a090b

+ 8 - 0
yanshi/.gitignore

@@ -0,0 +1,8 @@
+*.o
+*.dot
+*.output
+/build
+/src/parser.cc
+/src/parser.hh
+/src/lexer.cc
+/src/lexer.hh

+ 58 - 0
yanshi/Makefile

@@ -0,0 +1,58 @@
+CPPFLAGS := -g3 -std=c++1y -Isrc -I. -DHAVE_READLINE
+
+ifeq ($(build),release)
+  BUILD := release
+  CPPFLAGS += -Os
+else
+  BUILD := build
+  CPPFLAGS += -fsanitize=undefined,address -DDEBUG
+  LDLIBS := -lasan -lubsan
+endif
+
+LDLIBS += -licuuc -lreadline
+SRC := $(filter-out src/lexer.cc src/parser.cc, $(wildcard src/*.cc)) src/lexer.cc src/parser.cc
+OBJ := $(addprefix $(BUILD)/,$(subst src/,,$(SRC:.cc=.o)))
+UNITTEST_SRC := $(wildcard unittest/*.cc)
+UNITTEST_EXE := $(subst unittest/,,$(UNITTEST_SRC:.cc=))
+
+all: $(BUILD)/yanshi # unittest
+
+unittest: $(addprefix $(BUILD)/unittest/,$(UNITTEST_EXE))
+	$(foreach x,$(addprefix $(BUILD)/unittest/,$(UNITTEST_EXE)),$x && ) :
+
+sinclude $(OBJ:.o=.d)
+
+# FIXME
+$(BUILD)/repl.o: src/lexer.hh
+
+$(BUILD) $(BUILD)/unittest:
+	mkdir -p $@
+
+$(BUILD)/yanshi: $(OBJ)
+	$(LINK.cc) $^ $(LDLIBS) -o $@
+
+$(BUILD)/%.o: src/%.cc | $(BUILD)
+	$(CXX) $(CPPFLAGS) -MM -MP -MT $@ -MF $(@:.o=.d) $<
+	$(COMPILE.cc) $< -o $@
+
+# Build one unit-test executable from its source plus every object except main.o.
+# The dependency file is written to <target>.d: these targets carry no .o suffix,
+# so $(@:.o=.d) would expand to the executable's own path.
+$(BUILD)/unittest/%: unittest/%.cc $(wildcard unittest/*.hh) $(filter-out $(BUILD)/main.o,$(OBJ)) | $(BUILD)/unittest
+	$(CXX) $(CPPFLAGS) -MM -MP -MT $@ -MF $@.d $<
+	$(LINK.cc) $(filter-out %.hh,$^) $(LDLIBS) -o $@
+
+src/lexer.cc src/lexer.hh: src/lexer.l
+	flex --header-file=src/lexer.hh -o src/lexer.cc $<
+
+src/parser.cc src/parser.hh: src/parser.y src/common.hh src/location.hh src/option.hh src/syntax.hh
+	bison --defines=src/parser.hh -o src/parser.cc $<
+
+$(BUILD)/loader.o: src/parser.hh
+$(BUILD)/parser.o: src/lexer.hh
+$(BUILD)/lexer.o: src/parser.hh
+
+clean:
+	$(RM) -r build release
+
+# Also remove the flex/bison outputs. The files are listed explicitly because
+# brace expansion is a bash feature and recipes run under /bin/sh by default.
+distclean: clean
+	$(RM) src/lexer.cc src/lexer.hh src/parser.cc src/parser.hh
+
+# unittest is phony too: a unittest/ source directory shares the target's name.
+.PHONY: all clean distclean unittest

+ 217 - 0
yanshi/README.md

@@ -0,0 +1,217 @@
+# 偃师 (yanshi)
+
+yanshi is a finite state automaton generator like Ragel. Use inline operators to embed C++ code in the recognition of a language. yanshi is enhanced with features to approximate context-free grammar:
+
+- Approximation of substring grammar
+- Approximation of recursive automaton to match expressions with recursion.
+
+The motivation to create yanshi is that Ragel does not provide a mechanism to serialize its representation of finite state automata, making it difficult to post-process generated automata and obtain the substring grammar recognizer.
+
+Later on, I found a simplified SQL grammar might contain more than 10000 states. It was not only slow to generate the automaton, making it hard to do trial-and-error experiments, but wasted memory to store the automaton. I introduced `CollapseExpr` to allow circular references.
+
+`CallExpr` takes one step further, maintains a return address stack to imitate function calls. It can be seen as an augmented `CollapseExpr`, removing a lot of false positive cases.
+
+## Name
+
+From <https://en.wikipedia.org/wiki/Automaton>:
+
+<blockquote>
+In ancient China, a curious account of automata is found in the Lie Zi text (列子), written in the 3rd century BC. Within it there is a description of a much earlier encounter between King Mu of Zhou (周穆王, 1023-957 BC) and a mechanical engineer known as Yan Shi (偃师), an 'artificer'.
+</blockquote>
+
+## Build
+
+* Debug: `make`
+* Release: `make build=release`
+
+## Getting Started
+
+* Create file `a.ys`:
+  ```
+  export foo = 'hello'
+  ```
+
+  Run `yanshi -S a.ys -o /tmp/a.cc` to generate a C++ file from the yanshi source file `a.ys`.
+
+  + `yanshi_foo_start`: the start state is 0. States are represented by natural numbers.
+  + `yanshi_foo_is_final`: leaving `ret_stack` aside, the last line checks whether `u` is one of the final states.
+  + `yanshi_foo_transit`: leave aside `ret_stack`, `u` is the current state and `c` is the next input codepoint or label.
+
+  With the `-S` option, yanshi will generate a standalone C++ file.
+  ```
+  % make -C /tmp a
+  make: Entering directory '/tmp'
+  g++     a.cc   -o a
+  make: Leaving directory '/tmp'
+  % /tmp/a hello
+  0 h 1 e 2 l 3 l 4 o 5
+  len: 5
+  pref: 5
+  state: 5
+  final: true
+  % /tmp/a
+  hello<press C-d>0 h 1 e 2 l 3 l 4 o 5
+  len: 5
+  pref: 5
+  state: 5
+  final: true
+  ```
+
+  States are yellow and interleaved with transition labels. Final states are bold yellow.
+  + `len`: the length of input codepoints or labels
+  + `pref`: the length of the longest prefix that does not enter the dead state
+  + `state`: the state entered after consuming the input
+  + `final`: whether the state is one of final states
+
+* Interactive mode
+
+  The `-i` option enables interactive mode.
+  ```
+  % yanshi -i a.ys
+  Testing foo
+  foo :: DefineStmt
+  .integer mode
+  Commands available from the prompt:
+    .automaton    dump automaton
+    .assoc        dump associated AST Expr for each state
+    .help         display this help
+    .integer      input is a list of non-negative integers, macros(#define) or ''  quoted strings
+    .macro        display defined macros
+    .string       input is a string
+    .stmt <ident> change target DefineStmt to <ident>
+    .quit         exit interactive mode
+  λ 104 101 108 108 111
+  0 104 1 101 2 108 3 108 4 111 5
+  export foo = 'hello':
+  λ .string
+  .string mode
+  λ hello
+  0 h 1 e 2 l 3 l 4 o 5
+  export foo = 'hello':
+  λ
+  ```
+
+* Regex-like syntax
+  ```
+  export hello = [gh] 'e' l{2} 'o'
+  l = 'l'
+  ```
+
+  `[gh]` is a bracket expression and `l{2}` matches `l` at least twice. This grammar matches `hello`, `gello`, `helllo`, ...
+
+  + Union: `c = a | b`
+  + Intersection: `c = a && b`
+  + Difference: `c = a - b`
+  + Concatenation: `c = a b`
+  + Complement: `c = ~ a`
+
+* Actions (embedded C++ code)
+  ```
+  c++ {
+  #include <stdio.h>
+  }
+  export hello = '喵' @ { puts("meow"); } {2}
+  ```
+  The implementation of actions has not been fully thought through yet, so the point at which an action executes may be counter-intuitive.
+
+* Modules
+  ```
+  # a.ys
+  import 'b.ys' as B # B::bar
+  import 'b.ys' # qux
+
+  export foo = B::bar | qux
+  bar = '4'
+
+  # b.ys
+  bar = '3'
+  qux = '5'
+  ```
+
+* Substring grammar
+  Specify the `--substring-grammar` option to generate code for substring grammar. That is, the generated code matches every substring of the grammar. The implementation creates a new start state and a new final state, connects the start state to the old start state, and old final states to the new final state.
+
+* `EmbedExpr`, reference a nonterminal without modifiers
+   ```
+   foo = bar
+   bar = [0-9]
+   ```
+   The complete automaton of bar will be duplicated in each reference site. If the referenced automaton is large, `EmbedExpr` will significantly increase the number of states. `EmbedExpr` defines dependencies among states and no cyclic dependency is allowed.
+
+* `CollapseExpr`, reference a nonterminal with the `!` modifier
+  ```
+  export foo = 'pre' !bar 'post'
+  bar = [\u0300-\u034E]
+  quz = 'meow' !bar 'meow'
+  ```
+  The final state of `'pre'` and the start state of `'post'` will be connected by a special directed arc. When exporting, an epsilon transition will be added from the tail of the arc to the start state of `bar`, others will be added from the final states of `bar` to the head of the arc. `CollapseExpr` behaves like function calls, however, the return address is not preserved (hence the name `CollapseExpr`) and the state may go to other call sites. In this example, the state may go to either `foo` or `quz` after traveling through `bar`, causing false positives.
+
+* `CallExpr`, reference a nonterminal with the `&` modifier
+  ```
+  export foo = 'pre' &bar 'post'
+  bar = '4'
+  ```
+  This is a refinement of `CollapseExpr`. Suppose state `&B` is contained in `A`'s definition (`A` calls `B`). `&B` will be represented as a pseudo arc (`u -> v`), where `u` is the state before `&B` and `v` is the state after `&B`. If arcs of `u` do not collide with arcs of `B`, the transition function will push `v` to the return stack if the current state set contains `u` and there is no other transition. Note that `B` is disconnected from `A`, which is different from the `CollapseExpr` case. The machine will transit on automaton `B` greedily. If there is no transition, it will pop a return address (`v` in this case) and jump to it.
+
+## Contrib
+
+### Vim
+
+Syntax highlighting, and a syntax checking plugin for Syntastic
+
+```zsh
+ln -sr contrib/vim/compiler/yanshi.vim ~/.vim/compiler/
+ln -sr contrib/vim/ftdetect/yanshi.vim ~/.vim/ftdetect/
+ln -sr contrib/vim/ftplugin/yanshi.vim ~/.vim/ftplugin/
+ln -sr contrib/vim/syntax/yanshi.vim ~/.vim/syntax/
+ln -sr contrib/vim/syntax_checkers/yanshi ~/.vim/syntax_checkers/
+```
+
+### Zsh
+
+Command line completion
+
+```
+# ~/.zshrc
+fpath=(~/.zsh $fpath)
+
+# ln -sr contrib/zsh/_yanshi ~/.zsh/
+```
+
+## Internals
+
+```
+src
+  common.{cc,hh}
+  main.{cc,hh}
+  syntax.{cc,hh}
+  loader.{cc,hh}
+  fsa.{cc,hh}
+  fsa_anno.{cc,hh}
+  compiler.{cc,hh}
+  parser.y
+  lexer.l
+  location.cc
+```
+
+* Lex `lexer.l`
+* Parse and generate a syntax tree `parser.y`
+* `loader.cc`
+  + Get a list of definitions
+  + Recursively load for each `import`
+  + Resolve references and associate uses to definitions
+  + Build a dependency graph from `EmbedExpr`
+  + Compile automaton for each nonterminal in topological order. `CollapseExpr` and `CallExpr` are represented by special directed arcs.
+  + Generate code for `export` nonterminals, resolving `CollapseExpr` and `CallExpr`
+
+### Finite state automaton
+
+Each node of the syntax tree corresponds to an automaton. The parent builds an automaton from its children according to the semantics. The automaton of the parent may contain states from the automaton of one of the children, or it is a state introduced by the parent.
+
+`assoc[i]` records the associative nodes in the automaton tree (which part of the syntax tree has associations with this state) and positions (start state, final state or inner state) for state `i`. It serves three purposes:
+
+* Check which action should be triggered
+* Look for inner states (neither start nor final) in the implementation of substring grammar
+* Check whether it is associated to a `CallExpr` or `CollapseExpr`
+
+### `CollapseExpr`

+ 15 - 0
yanshi/contrib/vim/compiler/yanshi.vim

@@ -0,0 +1,15 @@
+"if exists('current_compiler')
+"  finish
+"endif
+let current_compiler = 'yanshi'
+
+if exists(':CompilerSet') != 2
+  command -nargs=* CompilerSet setlocal <args>
+endif
+CompilerSet errorformat=
+      \%E%f\ %l:%c-%*\\d\ error\ %m,
+      \%E%f\ %l-%*\\d:%c-%*\\d\ error\ %m,
+      \%W%f\ %l:%c-%*\\d\ warning\ %m,
+      \%W%f\ %l-%*\\d:%c-%*\\d\ warning\ %m,
+      \%C%.%#
+CompilerSet makeprg=yanshi\ -d0\ -c\ $*\ %

+ 2 - 0
yanshi/contrib/vim/ftdetect/yanshi.vim

@@ -0,0 +1,2 @@
+au BufRead,BufNewFile *.yanshi setf yanshi
+au BufRead,BufNewFile *.ys setf yanshi

+ 6 - 0
yanshi/contrib/vim/ftplugin/yanshi.vim

@@ -0,0 +1,6 @@
+if exists('b:did_ftplugin')
+  finish
+endif
+let b:did_ftplugin = 1
+
+compiler yanshi

+ 70 - 0
yanshi/contrib/vim/syntax/yanshi.vim

@@ -0,0 +1,70 @@
+if exists('b:current_syntax')
+  finish
+endif
+
+syn cluster yanshiCommentGroup contains=yanshiTodo
+syn include @yanshiCcode syntax/cpp.vim
+syn keyword yanshiAction action
+syn keyword yanshiMacro semicolon nosemicolon
+syn keyword yanshiStorageClass export intact
+syn keyword yanshiTodo contained TODO FIXME XXX
+syn match yanshiCpp 'c++'
+syn match yanshiActionOperator '[>$@%]'
+syn match yanshiCall '\^\w\+\(::\w\+\)\?'
+syn match yanshiCollapse '!\w\+\(::\w\+\)\?'
+syn match yanshiHighOp '[+\*?]'
+syn match yanshiIdent '\w\+\(::\w\+\)\?'
+syn match yanshiCpp display "^c++\s*" skipwhite nextgroup=yanshiBrace
+syn match yanshiImport display "^import\s*" contains=yanshiImported
+syn match yanshiLowOp '[-&|]'
+syn match yanshiSpecial display contained "\\\(x\x\x\|.\|$\)"
+syn region yanshiBrace matchgroup=Delimiter start='{' end='}' fold contains=@yanshiCcode
+syn region yanshiBracket start='\[' skip=+\\\\\|\\]+ end=']'
+syn region yanshiComment start='/\*' end='\*/' keepend contains=@yanshiCommentGroup,@Spell
+syn region yanshiImported display contained start="+" skip=+\\\\\|\\"+ end=+"+
+syn region yanshiLineComment start='#\|//' skip='\\$' end='$' keepend contains=@yanshiCommentGroup,@Spell
+syn region yanshiPreprocess start="#define" skip="\\$" end="$" keepend
+syn region yanshiQQString start=+"+ skip=+\\.+ end=+"+ contains=yanshiSpecial
+syn region yanshiQString start=+'+ skip=+\\.+ end=+'+
+
+syn region yanshiDefineStmt start='^\w\+\s*[=:]' end='$' skipnl contains=@yanshiExpr,yanshiComment,yanshiLineComment,yanshiParen0
+
+syn cluster yanshiExpr contains=yanshiActionOperator,yanshiBrace,yanshiBracket,yanshiCall,yanshiCollapse,yanshiIdent,yanshiHighOp,yanshiLowOp,yanshiQString,yanshiQQString,
+sy region yanshiParen0 matchgroup=yanshiParen0 start='(' end=')' contains=@yanshiExpr,yanshiParen1
+sy region yanshiParen1 matchgroup=yanshiParen1 start='(' end=')' contains=@yanshiExpr,yanshiParen2 contained
+sy region yanshiParen2 matchgroup=yanshiParen2 start='(' end=')' contains=@yanshiExpr,yanshiParen3 contained
+sy region yanshiParen3 matchgroup=yanshiParen3 start='(' end=')' contains=@yanshiExpr,yanshiParen4 contained
+sy region yanshiParen4 matchgroup=yanshiParen4 start='(' end=')' contains=@yanshiExpr,yanshiParen5 contained
+sy region yanshiParen5 matchgroup=yanshiParen5 start='(' end=')' contains=@yanshiExpr,yanshiParen0 contained
+hi yanshiParen0 ctermfg=brown guifg=#3bb9ff
+hi yanshiParen1 ctermfg=DarkBlue guifg=#f88017
+hi yanshiParen2 ctermfg=darkgray guifg=#5efb6e
+hi yanshiParen3 ctermfg=darkgreen guifg=#f62817
+hi yanshiParen4 ctermfg=darkcyan guifg=#fdd017
+hi yanshiParen5 ctermfg=darkmagenta guifg=#faafba
+
+hi link yanshiIdent          Identifier
+"TODO color mismatch of {}
+"hi link yanshiBrace          Statement
+"hi link yanshiDefineStmt     Statement
+hi def link yanshiCall           Constant
+hi def link yanshiCollapse       Constant
+hi def link yanshiAction         Structure
+hi def link yanshiActionOperator Type
+hi def link yanshiBracket        Function
+hi def link yanshiCpp            Structure
+hi def link yanshiComment        Comment
+hi def link yanshiHighOp         Operator
+hi def link yanshiImport         Include
+hi def link yanshiImported       String
+hi def link yanshiLineComment    Comment
+hi def link yanshiLowOp          Conditional
+hi def link yanshiMacro          Macro
+hi def link yanshiPreprocess     Macro
+hi def link yanshiQQString       String
+hi def link yanshiQString        String
+hi def link yanshiSpecial        SpecialChar
+hi def link yanshiStorageClass   StorageClass
+hi def link yanshiTodo           Todo
+
+let b:current_syntax = 'yanshi'

+ 29 - 0
yanshi/contrib/vim/syntax_checkers/yanshi/yanshi.vim

@@ -0,0 +1,29 @@
+if exists('g:loaded_syntastic_yanshi_yanshi_checker')
+  finish
+endif
+let g:loaded_syntastic_yanshi_yanshi_checker = 1
+
+let s:save_cpo = &cpo
+set cpo&vim
+
+fu! SyntaxCheckers_yanshi_yanshi_GetLocList() dict
+  let makeprg = self.makeprgBuild({ 'args': '-d0 -c' })
+
+  let errorformat =
+        \ '%C  %.%#,'.
+        \ '%E%f %l:%c-%*\d error %m,'.
+        \ '%E%f %l-%*\d:%c-%*\d error %m,'.
+        \ '%W%f %l:%c-%*\d warning %m,'.
+        \ '%W%f %l-%*\d:%c-%*\d warning %m'
+
+  return SyntasticMake({
+        \ 'makeprg': makeprg,
+        \ 'errorformat': errorformat })
+endf
+
+call g:SyntasticRegistry.CreateAndRegisterChecker({
+      \ 'filetype': 'yanshi',
+      \ 'name': 'yanshi'})
+
+let &cpo = s:save_cpo
+unlet s:save_cpo

+ 24 - 0
yanshi/contrib/zsh/_yanshi

@@ -0,0 +1,24 @@
+#compdef yanshi
+
+_arguments \
+  '(-b --bytes)'{-b,--bytes}'[make labels range over \[0,256), Unicode literals will be treated as UTF-8 bytes]' \
+  '(-c --check)'{-c,--check}'[check syntax & use/def]' \
+  '-C[generate C source code (default: C++)]' \
+  '(-d --debug)'{-d,--debug}'+[debug level]:level:(0 1 2 3 4 5)' \
+  '--dump-action[dump associated actions for each edge]' \
+  '--dump-assoc[dump associated AST Expr for each state]' \
+  '--dump-automaton[dump automata]' \
+  '--dump-embed[dump statistics of EmbedExpr]' \
+  '--dump-module[dump module use/def/...]' \
+  '--dump-tree[dump AST]' \
+  '(-G --graph)'{-G,--graph}'[output a Graphviz dot file]' \
+  '(-I --import)'{-I,--import}'=[add <dir> to search path for "import"]' \
+  '(-i --interactive)'{-i,--interactive}'[interactive mode]' \
+  '(-k --keep-inaccessible)'{-k,--keep-inaccessible}'[do not perform accessible/co-accessible]' \
+  '(-l --debug-output)'{-l,--debug-output}'=[filename for debug output]:file:_files' \
+  '--max-return-stack=[max length of return stack in C generator]:len:' \
+  '(-o --output)'{-o,--output}'=[.cc output filename]:file:_files' \
+  '(-O --output-header)'{-O,--output-header}'=[.hh output filename]:file:_files' \
+  '(-s --substring-grammar)'{-s,--substring-grammar}'[construct regular approximation of the substring grammar. Inner states of nonterminals labeled 'intact' are not connected to start/final]' \
+  '(-h --help)'{-h,--help}'[display this help]' \
+  '1:file:_files -g "*.{ys,yanshi}"'\

+ 187 - 0
yanshi/src/common.cc

@@ -0,0 +1,187 @@
+#include "common.hh"
+#include "option.hh"
+
+#include <errno.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sysexits.h>
+#include <time.h>
+#include <unistd.h>
+
+///// Error
+
+static const char *ENAME[] = {
+    /*   0 */ "",
+    /*   1 */ "EPERM", "ENOENT", "ESRCH", "EINTR", "EIO", "ENXIO",
+    /*   7 */ "E2BIG", "ENOEXEC", "EBADF", "ECHILD",
+    /*  11 */ "EAGAIN/EWOULDBLOCK", "ENOMEM", "EACCES", "EFAULT",
+    /*  15 */ "ENOTBLK", "EBUSY", "EEXIST", "EXDEV", "ENODEV",
+    /*  20 */ "ENOTDIR", "EISDIR", "EINVAL", "ENFILE", "EMFILE",
+    /*  25 */ "ENOTTY", "ETXTBSY", "EFBIG", "ENOSPC", "ESPIPE",
+    /*  30 */ "EROFS", "EMLINK", "EPIPE", "EDOM", "ERANGE",
+    /*  35 */ "EDEADLK/EDEADLOCK", "ENAMETOOLONG", "ENOLCK", "ENOSYS",
+    /*  39 */ "ENOTEMPTY", "ELOOP", "", "ENOMSG", "EIDRM", "ECHRNG",
+    /*  45 */ "EL2NSYNC", "EL3HLT", "EL3RST", "ELNRNG", "EUNATCH",
+    /*  50 */ "ENOCSI", "EL2HLT", "EBADE", "EBADR", "EXFULL", "ENOANO",
+    /*  56 */ "EBADRQC", "EBADSLT", "", "EBFONT", "ENOSTR", "ENODATA",
+    /*  62 */ "ETIME", "ENOSR", "ENONET", "ENOPKG", "EREMOTE",
+    /*  67 */ "ENOLINK", "EADV", "ESRMNT", "ECOMM", "EPROTO",
+    /*  72 */ "EMULTIHOP", "EDOTDOT", "EBADMSG", "EOVERFLOW",
+    /*  76 */ "ENOTUNIQ", "EBADFD", "EREMCHG", "ELIBACC", "ELIBBAD",
+    /*  81 */ "ELIBSCN", "ELIBMAX", "ELIBEXEC", "EILSEQ", "ERESTART",
+    /*  86 */ "ESTRPIPE", "EUSERS", "ENOTSOCK", "EDESTADDRREQ",
+    /*  90 */ "EMSGSIZE", "EPROTOTYPE", "ENOPROTOOPT",
+    /*  93 */ "EPROTONOSUPPORT", "ESOCKTNOSUPPORT",
+    /*  95 */ "EOPNOTSUPP/ENOTSUP", "EPFNOSUPPORT", "EAFNOSUPPORT",
+    /*  98 */ "EADDRINUSE", "EADDRNOTAVAIL", "ENETDOWN", "ENETUNREACH",
+    /* 102 */ "ENETRESET", "ECONNABORTED", "ECONNRESET", "ENOBUFS",
+    /* 106 */ "EISCONN", "ENOTCONN", "ESHUTDOWN", "ETOOMANYREFS",
+    /* 110 */ "ETIMEDOUT", "ECONNREFUSED", "EHOSTDOWN", "EHOSTUNREACH",
+    /* 114 */ "EALREADY", "EINPROGRESS", "ESTALE", "EUCLEAN",
+    /* 118 */ "ENOTNAM", "ENAVAIL", "EISNAM", "EREMOTEIO", "EDQUOT",
+    /* 123 */ "ENOMEDIUM", "EMEDIUMTYPE", "ECANCELED", "ENOKEY",
+    /* 127 */ "EKEYEXPIRED", "EKEYREVOKED", "EKEYREJECTED",
+    /* 130 */ "EOWNERDEAD", "ENOTRECOVERABLE", "ERFKILL", "EHWPOISON"
+};
+
+#define MAX_ENAME 133
+
+long action_label_base, action_label, call_label_base, call_label, collapse_label_base, collapse_label;
+
+/// Write a formatted diagnostic to stderr, wrapped in the RED / SGR0 escape
+/// sequences and terminated by a newline.
+///
+/// @param use_err when true, the message is prefixed with "[NAME text] ", where
+///                NAME is the symbolic errno name from ENAME and text is
+///                strerror() of the same errno value
+/// @param format  printf-style format string
+/// @param ap      arguments consumed by `format`
+void output_error(bool use_err, const char *format, va_list ap)
+{
+  const int err = errno; // snapshot before any library call below can modify it
+  char text[BUF_SIZE], msg[BUF_SIZE], buf[BUF_SIZE];
+  vsnprintf(msg, BUF_SIZE, format, ap);
+  if (use_err)
+    // ENAME has entries for 0..MAX_ENAME inclusive, so MAX_ENAME is a valid index
+    snprintf(text, BUF_SIZE, "[%s %s] ", 0 < err && err <= MAX_ENAME ? ENAME[err] : "?UNKNOWN?", strerror(err));
+  else
+    text[0] = '\0';
+  snprintf(buf, BUF_SIZE, RED "%s%s\n", text, msg);
+  fputs(buf, stderr);
+  fputs(SGR0, stderr);
+  fflush(stderr);
+}
+
+/// Report a printf-style message on stderr through output_error(). The errno
+/// prefix is requested only when errno is positive on entry, and errno is put
+/// back to its entry value before returning.
+void err_msg(const char *format, ...)
+{
+  const int entry_errno = errno;
+  va_list args;
+  va_start(args, format);
+  output_error(entry_errno > 0, format, args);
+  va_end(args);
+  errno = entry_errno;
+}
+#define err_msg_g(...) ({err_msg(__VA_ARGS__); goto quit;})
+
+/// Report a printf-style message through output_error(), print a symbolised
+/// backtrace by running addr2line on this executable, then exit.
+///
+/// @param exitno process exit status passed to exit()
+/// @param format printf-style format string, followed by its arguments
+void err_exit(int exitno, const char *format, ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  int saved = errno;
+  output_error(errno > 0, format, ap);
+  errno = saved;
+  va_end(ap);
+
+  void *bt[99];
+  char buf[1024];
+  static const char redirect[] = ">&2";
+  // keep room for the redirect suffix (including its NUL) at the end of buf
+  const size_t limit = sizeof buf - sizeof redirect;
+  // the capacity handed to backtrace() must be that of bt, not of buf
+  int nptrs = backtrace(bt, LEN(bt));
+  int n = snprintf(buf, limit, "addr2line -Cfipe %s", program_invocation_name);
+  if (n >= 0 && (size_t)n < limit) {
+    size_t len = n;
+    for (int j = 0; j < nptrs; j++) {
+      n = snprintf(buf+len, limit-len, " %p", bt[j]);
+      if (n < 0 || (size_t)n >= limit-len) {
+        buf[len] = '\0'; // drop the partially written address and stop
+        break;
+      }
+      len += n;
+    }
+    strcat(buf, redirect);
+    fputs("\n", stderr);
+    system(buf);
+  }
+  exit(exitno);
+}
+
+/// Parse `arg` as a long using strtol with base 0 (decimal, 0x-prefixed hex or
+/// 0-prefixed octal). Calls err_exit() with EX_USAGE when strtol sets errno,
+/// when no characters were converted, or when characters follow the number.
+///
+/// @param arg NUL-terminated text to convert
+/// @return the converted value
+long get_long(const char *arg)
+{
+  char *end;
+  errno = 0;
+  long ret = strtol(arg, &end, 0);
+  if (errno)
+    err_exit(EX_USAGE, "get_long: %s", arg);
+  // end == arg means nothing was converted (e.g. an empty string); strtol then
+  // returns 0 and is not required to set errno
+  if (end == arg || *end)
+    err_exit(EX_USAGE, "get_long: nonnumeric character");
+  return ret;
+}
+
+//// log
+//
+
+/// Write one log record to stdout: `prefix`, the local time as
+/// "HH:MM:SS.microseconds ", the printf-style message, then SGR0; stdout is
+/// flushed afterwards. The timestamp is omitted if the time cannot be converted.
+///
+/// @param prefix text written first (callers in this file pass a colour escape)
+/// @param format printf-style format string
+/// @param ap     arguments consumed by `format`
+void log_generic(const char *prefix, const char *format, va_list ap)
+{
+  char stamp[BUF_SIZE];
+  struct timeval tv;
+  struct tm now;
+  gettimeofday(&tv, NULL);
+  fputs(prefix, stdout);
+  // format the microseconds with an explicit long cast so the specifier and
+  // the argument type agree whatever suseconds_t is on this platform
+  if (localtime_r(&tv.tv_sec, &now) && strftime(stamp, sizeof stamp, "%T", &now))
+    printf("%s.%06ld ", stamp, (long)tv.tv_usec);
+  vprintf(format, ap);
+  fputs(SGR0, stdout);
+  fflush(stdout);
+}
+
+// Log a printf-style message through log_generic() with the CYAN prefix.
+void log_event(const char *format, ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  log_generic(CYAN, format, ap);
+  va_end(ap);
+}
+
+// Log a printf-style message through log_generic() with the GREEN prefix.
+void log_action(const char *format, ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  log_generic(GREEN, format, ap);
+  va_end(ap);
+}
+
+// Log a printf-style message through log_generic() with the YELLOW prefix.
+void log_status(const char *format, ...)
+{
+  va_list ap;
+  va_start(ap, format);
+  log_generic(YELLOW, format, ap);
+  va_end(ap);
+}
+
+// Shared body of the colour helpers below: emit `seq` only when `fd` refers to
+// a terminal. The text goes to stdout for STDOUT_FILENO and to stderr otherwise.
+static void put_escape(long fd, const char *seq)
+{
+  if (!isatty(fd))
+    return;
+  fputs(seq, fd == STDOUT_FILENO ? stdout : stderr);
+}
+
+void bold(long fd) { put_escape(fd, "\x1b[1m"); }
+void blue(long fd) { put_escape(fd, BLUE); }
+void cyan(long fd) { put_escape(fd, CYAN); }
+void green(long fd) { put_escape(fd, GREEN); }
+void magenta(long fd) { put_escape(fd, MAGENTA); }
+void red(long fd) { put_escape(fd, RED); }
+void sgr0(long fd) { put_escape(fd, SGR0); }
+void yellow(long fd) { put_escape(fd, YELLOW); }
+void normal_yellow(long fd) { put_escape(fd, NORMAL_YELLOW); }
+
+// Write an empty string padded to a field width of 2*d to `f`, i.e. two spaces
+// of indentation per level `d`.
+void indent(FILE* f, int d)
+{
+  fprintf(f, "%*s", 2*d, "");
+}
+
+// Replace the stored intervals with the gaps between them: the stretch from 0
+// to the first start, each stretch from one interval's end to the next start,
+// and the stretch from the last end up to AB. Empty gaps are not stored.
+void DisjointIntervals::flip() {
+  map<long, long> gaps;
+  long cursor = 0; // end of the interval visited most recently
+  for (auto iv = to.begin(); iv != to.end(); ++iv) {
+    if (cursor < iv->first)
+      gaps.emplace(cursor, iv->first);
+    cursor = iv->second;
+  }
+  if (cursor < AB)
+    gaps.emplace(cursor, AB);
+  to = move(gaps);
+}
+
+// Dump every stored interval to stdout as "(start,end) " on a single line.
+void DisjointIntervals::print() {
+  for (auto iv = to.begin(); iv != to.end(); ++iv)
+    printf("(%ld,%ld) ", iv->first, iv->second);
+  putchar('\n');
+}

+ 130 - 0
yanshi/src/common.hh

@@ -0,0 +1,130 @@
+#pragma once
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE
+#endif
+#include <assert.h>
+#include <map>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <type_traits>
+#include <utility>
+#include <vector>
+using std::map;
+using std::vector;
+
+typedef int8_t i8;
+typedef int16_t i16;
+typedef int32_t i32;
+typedef int64_t i64;
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef unsigned long ulong;
+
+#ifdef __APPLE__
+#include <crt_externs.h>
+extern char*** _NSGetArgv(void);
+#define program_invocation_name (((char **)*_NSGetArgv())[0])
+#define program_invocation_short_name (((char **)*_NSGetArgv())[0])
+#endif
+
+#define LEN(x) (sizeof(x)/sizeof(*x))
+#define ALL(x) (x).begin(), (x).end()
+#define REP(i, n) FOR(i, 0, n)
+#define FOR(i, a, b) for (typename std::remove_cv<typename std::remove_reference<decltype(b)>::type>::type i = (a); i < (b); i++)
+#define ROF(i, a, b) for (typename std::remove_cv<typename std::remove_reference<decltype(b)>::type>::type i = (b); --i >= (a); )
+
+#define SGR0 "\x1b[m"
+#define RED "\x1b[1;31m"
+#define GREEN "\x1b[1;32m"
+#define YELLOW "\x1b[1;33m"
+#define BLUE "\x1b[1;34m"
+#define MAGENTA "\x1b[1;35m"
+#define CYAN "\x1b[1;36m"
+#define NORMAL_YELLOW "\x1b[33m"
+const long MAX_CODEPOINT = 0x10ffff;
+extern long action_label_base, action_label, call_label_base, call_label, collapse_label_base, collapse_label;
+
+void bold(long fd = 1);
+void blue(long fd = 1);
+void cyan(long fd = 1);
+void green(long fd = 1);
+void magenta(long fd = 1);
+void red(long fd = 1);
+void sgr0(long fd = 1);
+void yellow(long fd = 1);
+void normal_yellow(long fd = 1);
+void indent(FILE* f, int d);
+
+const size_t BUF_SIZE = 512;
+
+void output_error(bool use_err, const char *format, va_list ap);
+void err_msg(const char *format, ...);
+void err_exit(int exitno, const char *format, ...);
+
+long get_long(const char *arg);
+
+void log_generic(const char *prefix, const char *format, va_list ap);
+void log_event(const char *format, ...);
+void log_action(const char *format, ...);
+void log_status(const char *format, ...);
+
+extern long debug_level;
+extern FILE* debug_file;
+#define DP(level, ...)  do {           \
+    if (level <= debug_level) {        \
+      fprintf(debug_file, "%s:%d ", __FILE__, __LINE__); \
+      fprintf(debug_file, __VA_ARGS__);\
+      fprintf(debug_file, "\n");       \
+      fflush(debug_file);              \
+    }                                  \
+  } while (0)
+
+/// Construct a T in place at the front of `a`. The arguments are perfectly
+/// forwarded to T's constructor, so rvalue arguments are moved rather than
+/// copied.
+template<class T, class... Args>
+void emplace_front(vector<T>& a, Args&&... args)
+{
+  a.emplace(a.begin(), std::forward<Args>(args)...);
+}
+
+/// Insert a copy of `x` into `a` directly after the last element that is not
+/// greater than `x`, scanning backwards from the end. On a vector sorted by
+/// operator< this keeps the vector sorted and places `x` after equal elements.
+///
+/// The position is found first and the element is added with vector::insert,
+/// so `x` may refer to an element of `a` and T need not be default-constructible.
+template<class T>
+void sorted_insert(vector<T>& a, const T& x)
+{
+  auto pos = a.end();
+  while (pos != a.begin() && x < pos[-1])
+    --pos;
+  a.insert(pos, x);
+}
+
+/// Build a T from `args` (perfectly forwarded, brace-initialised) and insert it
+/// into `a` directly after the last element that is not greater than it,
+/// scanning backwards from the end. On a vector sorted by operator< this keeps
+/// the vector sorted. T need not be default-constructible.
+template<class T, class... Args>
+void sorted_emplace(vector<T>& a, Args&&... args)
+{
+  T x{std::forward<Args>(args)...};
+  auto pos = a.end();
+  while (pos != a.begin() && x < pos[-1])
+    --pos;
+  a.insert(pos, std::move(x));
+}
+
+// A set of intervals over long, stored as a map from interval start to
+// interval end.
+// NOTE(review): intervals look half-open [start, end), judging by how flip()
+// builds the gaps between them — confirm.
+struct DisjointIntervals
+{
+  typedef std::pair<long, long> value_type;
+  std::map<long, long> to; // start -> end of each stored interval
+  // Insert the interval constructed from `args`, merging it with every stored
+  // interval that it overlaps or touches.
+  template<class... Args>
+  void emplace(Args&&... args) {
+    value_type x{args...};
+    auto it = to.lower_bound(x.first);
+    // the preceding interval reaches x.first: absorb it by taking over its start
+    if (it != to.begin() && x.first <= prev(it)->second)
+      x.first = (--it)->first;
+    auto it2 = to.upper_bound(x.second);
+    // the last interval starting at or before x.second ends later: extend x to its end
+    if (it2 != to.begin() && prev(it2)->first <= x.second && x.second < prev(it2)->second)
+      x.second = prev(it2)->second;
+    // drop every stored interval now covered by x, then store x itself
+    while (it != it2)
+      it = to.erase(it);
+    to.emplace(x);
+  }
+  void flip();
+  void print(); // XXX
+};

+ 853 - 0
yanshi/src/compiler.cc

@@ -0,0 +1,853 @@
+#include "compiler.hh"
+#include "fsa_anno.hh"
+#include "loader.hh"
+#include "option.hh"
+
+#include <algorithm>
+#include <ctype.h>
+#include <limits.h>
+#include <map>
+#include <sstream>
+#include <stack>
+#include <unordered_map>
+using namespace std;
+
+// Annotated automaton of each DefineStmt. Entries are created by compile()
+// and later rewritten in place by compile_export() for exported statements.
+unordered_map<DefineStmt*, FsaAnno> compiled;
+// Per exported DefineStmt, indexed by state: (callee start state, state to
+// return to) for states that perform a call, (-1, -1) otherwise. Filled by
+// compile_export(), read by generate_transitions().
+static unordered_map<DefineStmt*, vector<pair<long, long>>> stmt2call_addr;
+// Per exported DefineStmt, indexed by state: the `sub_final` flags computed
+// by compile_export() (states containing a final state of a DefineStmt that
+// is used as a call target).
+static unordered_map<DefineStmt*, vector<bool>> stmt2final;
+
+// Debug dump: for every state of `anno.fsa`, print the expressions
+// associated with it. Each entry shows the expression name, tag markers
+// ('^' start, '.' inner, '$' final), its source location, and the number of
+// attached actions of each kind (> entering, % leaving, @ finishing,
+// $ transiting) when non-zero.
+void print_assoc(const FsaAnno& anno)
+{
+  magenta(); printf("=== Associated Expr of each state\n"); sgr0();
+  REP(i, anno.fsa.n()) {
+    printf("%ld:", i);
+    for (const auto& aa: anno.assoc[i]) {
+      auto a = aa.first;
+      printf(" %s%s%s%s(%ld-%ld", a->name().c_str(),
+             has_start(aa.second) ? "^" : "",
+             has_inner(aa.second) ? "." : "",
+             has_final(aa.second) ? "$" : "",
+             a->loc.start, a->loc.end);
+      // size() yields an unsigned size type, hence %zu rather than %zd
+      if (a->entering.size())
+        printf(",>%zu", a->entering.size());
+      if (a->leaving.size())
+        printf(",%%%zu", a->leaving.size());
+      if (a->finishing.size())
+        printf(",@%zu", a->finishing.size());
+      if (a->transiting.size())
+        printf(",$%zu", a->transiting.size());
+      printf(")");
+    }
+    puts("");
+  }
+  puts("");
+}
+
+// Debug dump of an automaton: start state, final states, then for every
+// state its outgoing edges as "(label,target)" or "(lo-hi,target)".
+// Consecutive edges whose ranges are contiguous and share the same target
+// are printed as a single range.
+void print_automaton(const Fsa& fsa)
+{
+  magenta(); printf("=== Automaton\n"); sgr0();
+  green(); printf("start: %ld\n", fsa.start);
+  red(); printf("finals:");
+  for (long f: fsa.finals)
+    printf(" %ld", f);
+  puts("");
+  sgr0(); puts("edges:");
+  REP(u, fsa.n()) {
+    printf("%ld:", u);
+    const auto& edges = fsa.adj[u];
+    auto it = edges.begin();
+    while (it != edges.end()) {
+      long lo = it->first.first, hi = it->first.second, target = it->second;
+      // absorb following edges that continue this range to the same target
+      for (++it; it != edges.end() && hi == it->first.first && it->second == target; ++it)
+        hi = it->first.second;
+      // edge ranges are half-open, so the last label is hi-1
+      if (lo == hi-1)
+        printf(" (%ld,%ld)", lo, target);
+      else
+        printf(" (%ld-%ld,%ld)", lo, hi-1, target);
+    }
+    puts("");
+  }
+  puts("");
+}
+
+// Lowest common ancestor of two expression nodes, using the `anc` jump
+// tables filled by Compiler::pre_expr (anc[k] is reached by following
+// anc[k-1] twice, anc[0] is the parent).
+// Returns the root's anc[0] (NULL) when the nodes are in different trees.
+Expr* find_lca(Expr* u, Expr* v)
+{
+  // Keep `v` as the deeper node.
+  if (u->depth > v->depth)
+    swap(u, v);
+  // Lift `v` until both nodes are at the same depth.
+  if (u->depth < v->depth) {
+    long k = 63-__builtin_clzl(v->depth-u->depth);
+    while (k >= 0) {
+      if (u->depth <= v->depth-(1L<<k))
+        v = v->anc[k];
+      k--;
+    }
+  }
+  if (u == v)
+    return u;
+  // Lift both in lock-step while their k-th ancestors still differ.
+  if (v->depth) {
+    long k = 63-__builtin_clzl(v->depth);
+    for (; k >= 0; k--) {
+      // the table shrinks as the nodes get shallower, so bound-check k
+      if (k >= u->anc.size() || u->anc[k] == v->anc[k])
+        continue;
+      u = u->anc[k];
+      v = v->anc[k];
+    }
+  }
+  return u->anc[0]; // NULL if two trees
+}
+
+// Expression-tree visitor that builds the annotated automaton of an
+// expression bottom-up.
+//
+// `st` holds the results of finished subexpressions: every leaf visit
+// pushes one FsaAnno, unary operators transform the top in place, and
+// binary operators take the rhs result off the stack before building the
+// lhs and combining into it. After visiting a whole tree, `st.top()` is its
+// automaton.
+// `path` is the chain of expressions currently being visited; it is used to
+// assign depth and ancestor tables (consumed by find_lca).
+struct Compiler : Visitor<Expr> {
+  stack<FsaAnno> st;
+  stack<Expr*> path;
+  long tick = 0;
+
+  // Assign the pre-order number, the depth and the ancestor jump table of
+  // `expr`, then enter it on `path`.
+  void pre_expr(Expr& expr) {
+    expr.pre = tick++;
+    expr.depth = path.size();
+    if (path.size()) {
+      // anc[0] is the parent; anc[k] is anc[k-1] of anc[k-1]
+      expr.anc.assign(1, path.top());
+      for (long k = 1; 1L << k <= expr.depth; k++)
+        expr.anc.push_back(expr.anc[k-1]->anc[k-1]);
+    } else
+      expr.anc.assign(1, nullptr);
+    path.push(&expr);
+    DP(5, "%s(%ld-%ld)", expr.name().c_str(), expr.loc.start, expr.loc.end);
+  }
+  // Leave `expr`: record the tick reached after its subtree and, in DEBUG
+  // builds, check the automaton just produced.
+  void post_expr(Expr& expr) {
+    path.pop();
+    expr.post = tick;
+#ifdef DEBUG
+    st.top().fsa.check();
+#endif
+  }
+
+  // Generic entry point: bookkeeping around the type-specific visit.
+  void visit(Expr& expr) override {
+    pre_expr(expr);
+    expr.accept(*this);
+    post_expr(expr);
+  }
+  void visit(BracketExpr& expr) override {
+    st.push(FsaAnno::bracket(expr));
+  }
+  void visit(CallExpr& expr) override {
+    st.push(FsaAnno::call(expr));
+  }
+  void visit(CollapseExpr& expr) override {
+    st.push(FsaAnno::collapse(expr));
+  }
+  void visit(ComplementExpr& expr) override {
+    visit(*expr.inner);
+    st.top().complement(&expr);
+  }
+  void visit(ConcatExpr& expr) override {
+    visit(*expr.rhs);
+    FsaAnno rhs = move(st.top());
+    st.pop(); // drop the moved-from entry so the stack does not accumulate husks
+    visit(*expr.lhs);
+    st.top().concat(rhs, &expr);
+  }
+  void visit(DifferenceExpr& expr) override {
+    visit(*expr.rhs);
+    FsaAnno rhs = move(st.top());
+    st.pop();
+    visit(*expr.lhs);
+    st.top().difference(rhs, &expr);
+  }
+  void visit(DotExpr& expr) override {
+    st.push(FsaAnno::dot(&expr));
+  }
+  void visit(EmbedExpr& expr) override {
+    st.push(FsaAnno::embed(expr));
+  }
+  void visit(EpsilonExpr& expr) override {
+    st.push(FsaAnno::epsilon_fsa(&expr));
+  }
+  void visit(IntersectExpr& expr) override {
+    visit(*expr.rhs);
+    FsaAnno rhs = move(st.top());
+    st.pop();
+    visit(*expr.lhs);
+    st.top().intersect(rhs, &expr);
+  }
+  void visit(LiteralExpr& expr) override {
+    st.push(FsaAnno::literal(expr));
+  }
+  void visit(PlusExpr& expr) override {
+    visit(*expr.inner);
+    st.top().plus(&expr);
+  }
+  void visit(QuestionExpr& expr) override {
+    visit(*expr.inner);
+    st.top().question(&expr);
+  }
+  void visit(RepeatExpr& expr) override {
+    visit(*expr.inner);
+    st.top().repeat(expr);
+  }
+  void visit(StarExpr& expr) override {
+    visit(*expr.inner);
+    st.top().star(&expr);
+  }
+  void visit(UnionExpr& expr) override {
+    visit(*expr.rhs);
+    FsaAnno rhs = move(st.top());
+    st.pop();
+    visit(*expr.lhs);
+    st.top().union_(rhs, &expr);
+  }
+};
+
+// Build the annotated automaton of `stmt` once and cache it in `compiled`.
+// The result is determinized and minimized. Does nothing when the statement
+// already has an entry.
+void compile(DefineStmt* stmt)
+{
+  if (compiled.find(stmt) != compiled.end())
+    return;
+  // The map entry is created before the expression tree is visited.
+  FsaAnno& result = compiled[stmt];
+  Compiler compiler;
+  compiler.visit(*stmt->rhs);
+  result = move(compiler.st.top());
+  result.determinize(NULL, NULL);
+  result.minimize(NULL);
+  DP(4, "size(%s::%s) = %ld", stmt->module->filename.c_str(), stmt->lhs.c_str(), result.fsa.n());
+}
+
+// Emit the source of yanshi_<name>_transit() for the exported `stmt` to
+// `output`, and its prototype to `output_header` when that stream is set.
+//
+// The generated function is a switch over the current state `u` with a
+// nested switch over the input `c`. Each case assigns the target state `v`
+// and runs the user actions attached to the expressions that are left,
+// entered, transited or finished by that edge. States recorded in
+// stmt2call_addr push a return state and jump to the callee's start; states
+// flagged in stmt2final pop the return stack on an unmatched input.
+void generate_transitions(DefineStmt* stmt)
+{
+  FsaAnno& anno = compiled[stmt];
+  auto& call_addr = stmt2call_addr[stmt];
+  auto& sub_final = stmt2final[stmt];
+  // For state u: every expression associated with u together with all of
+  // its ancestors, one entry per expression with the tags OR-ed together,
+  // sorted by expression pointer. Note that it sorts anno.assoc[u] in place.
+  auto find_within = [&](long u) {
+    vector<pair<Expr*, ExprTag>> within;
+    Expr* last = NULL;
+    sort(ALL(anno.assoc[u]), [](const pair<Expr*, ExprTag>& x, const pair<Expr*, ExprTag>& y) {
+      if (x.first->pre != y.first->pre)
+        return x.first->pre < y.first->pre;
+      return x.second < y.second;
+    });
+    for (auto aa: anno.assoc[u]) {
+      // Walk up only as far as the LCA with the previous expression; the
+      // ancestors above it were already collected.
+      Expr* stop = last ? find_lca(last, aa.first) : NULL;
+      last = aa.first;
+      for (Expr* x = aa.first; x != stop; x = x->anc[0])
+        within.emplace_back(x, aa.second);
+    }
+    sort(ALL(within));
+    // Collapse runs of the same expression into one entry, OR-ing the tags.
+    auto j = within.begin();
+    for (auto i = within.begin(); i != within.end(); ) {
+      Expr* x = i->first;
+      long t = long(i->second);
+      while (++i != within.end() && x == i->first)
+        t |= long(i->second);
+      *j++ = {x, ExprTag(t)};
+    }
+    within.erase(j, within.end());
+    return within;
+  };
+  decltype(anno.assoc) withins(anno.fsa.n());
+  REP(i, anno.fsa.n())
+    withins[i] = move(find_within(i));
+
+  // Source text of an action: inline code, or the code of the referenced
+  // action definition.
+  auto get_code = [](Action* action) {
+    if (auto t = dynamic_cast<InlineAction*>(action))
+      return t->code;
+    else if (auto t = dynamic_cast<RefAction*>(action))
+      return t->define_stmt->code;
+    else
+      assert(0);
+    return string();
+  };
+
+// D(S): when opt_dump_action is set, print the action about to be attached
+// to the current edge, prefixed by the marker S. Relies on the locals
+// `action`, `u`, `from`, `to` and `v` of the expansion site.
+#define D(S) if (opt_dump_action) { \
+               if (auto t = dynamic_cast<InlineAction*>(action.first)) { \
+                 if (from == to-1) \
+                   printf(S " %ld %ld %ld %s\n", u, from, v, t->code.c_str()); \
+                 else \
+                   printf(S " %ld %ld-%ld %ld %s\n", u, from, to-1, v, t->code.c_str()); \
+               } else if (auto t = dynamic_cast<RefAction*>(action.first)) { \
+                 if (from == to-1) \
+                   printf(S " %ld %ld %ld %s\n", u, from, v, t->define_stmt->code.c_str()); \
+                 else \
+                   printf(S " %ld %ld-%ld %ld %s\n", u, from, to-1, v, t->define_stmt->code.c_str()); \
+               } \
+             }
+
+  // Prototype in the header: C flavor takes an explicit return-stack array
+  // and length, C++ flavor takes a vector<long>.
+  if (output_header) {
+    if (opt_gen_c) {
+      if (opt_gen_extern_c)
+        fputs("extern \"C\" ", output_header);
+      fprintf(output_header, "long yanshi_%s_transit(long* ret_stack, long* ret_stack_len, long u, long c", stmt->lhs.c_str());
+    }
+    else
+      fprintf(output_header, "long yanshi_%s_transit(vector<long>& ret_stack, long u, long c", stmt->lhs.c_str());
+    if (stmt->export_params.size())
+      fprintf(output_header, ", %s", stmt->export_params.c_str());
+    fprintf(output_header, ");\n");
+  }
+  // Same signature for the definition.
+  if (opt_gen_c) {
+    if (opt_gen_extern_c)
+      fputs("extern \"C\" ", output);
+    fprintf(output, "long yanshi_%s_transit(long* ret_stack, long* ret_stack_len, long u, long c", stmt->lhs.c_str());
+  }
+  else
+    fprintf(output, "long yanshi_%s_transit(vector<long>& ret_stack, long u, long c", stmt->lhs.c_str());
+  if (stmt->export_params.size())
+    fprintf(output, ", %s", stmt->export_params.c_str());
+fprintf(output,
+")\n"
+"{\n"
+"  long v = -1;\n"
+"again:\n"
+"  switch (u) {\n");
+  REP(u, anno.fsa.n()) {
+    // Calling state: jump to the callee's start, push the state to return
+    // to, and re-dispatch on the same input.
+    if (call_addr[u].first >= 0) { // no other transitions
+      fprintf(output,
+"  case %ld:\n"
+"    u = %ld;\n"
+, u, call_addr[u].first);
+      if (opt_gen_c)
+        fprintf(output,
+"    if (*ret_stack_len >= %ld) return -1;\n"
+"    ret_stack[(*ret_stack_len)++] = %ld;\n"
+, opt_max_return_stack, call_addr[u].second);
+      else
+        fprintf(output,
+"    ret_stack.push_back(%ld);\n"
+, call_addr[u].second);
+      fprintf(output,
+"    goto again;\n");
+      continue;
+    }
+    // Nothing to emit for a state without edges that is not a sub-final.
+    if (anno.fsa.adj[u].empty() && ! sub_final[u])
+      continue;
+    indent(output, 1);
+    fprintf(output, "case %ld:\n", u);
+    indent(output, 2);
+    fprintf(output, "switch (c) {\n");
+
+    // target state -> (label ranges leading to it, actions to run)
+    unordered_map<long, pair<vector<pair<long, long>>, vector<pair<Action*, long>>>> v2case;
+    for (auto it = anno.fsa.adj[u].begin(); it != anno.fsa.adj[u].end(); ) {
+      // Merge contiguous ranges that go to the same target.
+      long from = it->first.first, to = it->first.second, v = it->second;
+      while (++it != anno.fsa.adj[u].end() && to == it->first.first && it->second == v)
+        to = it->first.second;
+      v2case[v].first.emplace_back(from, to);
+      auto& body = v2case[v].second;
+
+      // The loops below are merge-style scans over withins[u] and
+      // withins[v], which are both sorted by expression pointer.
+      auto ie = withins[u].end(), je = withins[v].end();
+
+      // leaving = Expr(u) - Expr(v)
+      for (auto i = withins[u].begin(), j = withins[v].begin(); i != ie; ++i) {
+        while (j != je && i->first > j->first)
+          ++j;
+        if (j == je || i->first != j->first)
+          for (auto action: i->first->leaving) {
+            D("%%");
+            body.push_back(action);
+          }
+      }
+
+      // entering = Expr(v) - Expr(u)
+      for (auto i = withins[u].begin(), j = withins[v].begin(); j != je; ++j) {
+        while (i != ie && i->first < j->first)
+          ++i;
+        if (i == ie || i->first != j->first)
+          for (auto action: j->first->entering) {
+            D(">");
+            body.push_back(action);
+          }
+      }
+
+      // transiting = intersect(Expr(u), Expr(v))
+      for (auto i = withins[u].begin(), j = withins[v].begin(); j != je; ++j) {
+        while (i != ie && i->first < j->first)
+          ++i;
+        if (i != ie && i->first == j->first)
+          for (auto action: j->first->transiting) {
+            D("$");
+            body.push_back(action);
+          }
+      }
+
+      // finishing = intersect(Expr(u), Expr(v)) & Expr(v).has_final(v)
+      for (auto i = withins[u].begin(), j = withins[v].begin(); j != je; ++j) {
+        while (i != ie && i->first < j->first)
+          ++i;
+        if (i != ie && i->first == j->first && has_final(j->second))
+          for (auto action: j->first->finishing) {
+            D("@");
+            body.push_back(action);
+          }
+      }
+    }
+
+    // One group of case labels per target state.
+    for (auto& x: v2case) {
+      for (auto& y: x.second.first) {
+        indent(output, 2);
+        // ranges are half-open; "case a ... b" is the GNU case-range form
+        if (y.first == y.second-1)
+          fprintf(output, "case %ld:\n", y.first);
+        else
+          fprintf(output, "case %ld ... %ld:\n", y.first, y.second-1);
+      }
+      indent(output, 3);
+      fprintf(output, "v = %ld;\n", x.first);
+
+      // actions
+      // Ordered by the action's number (pair.second), then by pointer;
+      // duplicates collected from several merged ranges are dropped.
+      sort(ALL(x.second.second), [](const pair<Action*, long>& a0, const pair<Action*, long>& a1) {
+        return a0.second != a1.second ? a0.second < a1.second : a0.first < a1.first;
+      });
+      x.second.second.erase(unique(ALL(x.second.second)), x.second.second.end());
+      for (auto a: x.second.second)
+        fprintf(output, "{%s}\n", get_code(a.first).c_str());
+      indent(output, 3);
+      fprintf(output, "break;\n");
+    }
+    // return from finals of DefineStmt called by CallExpr
+    if (sub_final[u]) {
+      indent(output, 2);
+      fprintf(output, "default:\n");
+      indent(output, 3);
+      fprintf(output, opt_gen_c ?
+"if (*ret_stack_len) { u = ret_stack[--*ret_stack_len]; goto again; }\n"
+:
+"if (ret_stack.size()) { u = ret_stack.back(); ret_stack.pop_back(); goto again; }\n");
+      indent(output, 3);
+      fprintf(output, "break;\n");
+    }
+
+    indent(output, 2);
+    fprintf(output, "}\n");
+    indent(output, 2);
+    fprintf(output, "break;\n");
+  }
+  indent(output, 1);
+  fprintf(output, "}\n");
+  indent(output, 1);
+  fprintf(output, "return v;\n");
+  fprintf(output, "}\n\n");
+}
+
+// Prepare the exported `stmt` for code generation.
+//
+// Steps, in order:
+//  1. Lay out the automata of `stmt` and of every DefineStmt reachable
+//     through CallExpr/CollapseExpr in one state space; CollapseExpr is
+//     wired in with epsilon edges and its special labels are removed.
+//  2. Optionally build the substring grammar.
+//  3. Determinize, minimize and (unless opt_keep_inaccessible) prune
+//     inaccessible and non-co-accessible states. After each step the
+//     bookkeeping (`sub_final`, start state of each called DefineStmt) is
+//     carried over to the new state numbering.
+//  4. Record the call table in stmt2call_addr and the flags in stmt2final,
+//     then strip the remaining action/CallExpr labels.
+//
+// Overwrites compiled[stmt]. Returns false after reporting an error when a
+// callee's start ends up in several DFA states or a calling state has other
+// transitions.
+bool compile_export(DefineStmt* stmt)
+{
+  DP(2, "Exporting %s", stmt->lhs.c_str());
+  FsaAnno& anno = compiled[stmt];
+
+  DP(3, "Construct automaton with all DefineStmt associated to referenced CallExpr/CollapseExpr");
+  vector<vector<Edge>> adj;
+  decltype(anno.assoc) assoc;
+  vector<vector<DefineStmt*>> cllps;
+  long allo = 0; // number of states allocated so far
+  unordered_map<DefineStmt*, long> stmt2offset; // first combined state of each statement
+  unordered_map<DefineStmt*, long> stmt2start;  // start state of each called statement
+  unordered_map<long, DefineStmt*> start2stmt;  // inverse of stmt2start
+  vector<long> starts;
+  vector<bool> sub_final; // final states of statements used as call targets
+  // Append the automaton of `stmt` to the combined automaton and recurse
+  // into every statement it calls or collapses.
+  function<void(DefineStmt*)> allocate = [&](DefineStmt* stmt) {
+    if (stmt2offset.count(stmt))
+      return;
+    DP(4, "Allocate %ld to %s", allo, stmt->lhs.c_str());
+    FsaAnno& anno = compiled[stmt];
+    long base = stmt2offset[stmt] = allo;
+    allo += anno.fsa.n();
+    sub_final.resize(allo);
+    if (used_as_call.count(stmt)) {
+      stmt2start[stmt] = base+anno.fsa.start;
+      start2stmt[base+anno.fsa.start] = stmt;
+      starts.push_back(base+anno.fsa.start);
+      for (long f: anno.fsa.finals)
+        sub_final[base+f] = true;
+    }
+    // Copy the edges and shift their targets into the combined numbering.
+    adj.insert(adj.end(), ALL(anno.fsa.adj));
+    REP(i, anno.fsa.n())
+      for (auto& e: adj[base+i])
+        e.second += base;
+    assoc.insert(assoc.end(), ALL(anno.assoc));
+    FOR(i, base, base+anno.fsa.n()) {
+      for (auto aa: assoc[i])
+        if (has_start(aa.second)) {
+          if (auto* e = dynamic_cast<CallExpr*>(aa.first)) {
+            DefineStmt* v = e->define_stmt;
+            allocate(v);
+          } else if (auto* e = dynamic_cast<CollapseExpr*>(aa.first)) {
+            DefineStmt* v = e->define_stmt;
+            allocate(v);
+            // (i@{CollapseExpr,...}, special, _) -> ({CollapseExpr,...}, epsilon, CollapseExpr.define_stmt.start)
+            sorted_emplace(adj[i], epsilon, stmt2offset[v]+compiled[v].fsa.start);
+          }
+        }
+      // Scan the edges of i from the back while they reach above
+      // collapse_label_base: an edge straddling the base is truncated to
+      // it, an edge entirely above it is dropped.
+      long j = adj[i].size();
+      while (j && collapse_label_base < adj[i][j-1].first.second) {
+        long v = adj[i][j-1].second;
+        if (adj[i][j-1].first.first < collapse_label_base)
+          adj[i][j-1].first.second = collapse_label_base;
+        else
+          j--;
+        CollapseExpr* e;
+        for (auto aa: assoc[v])
+          if (has_final(aa.second) && (e = dynamic_cast<CollapseExpr*>(aa.first))) {
+            DefineStmt* w = e->define_stmt;
+            allocate(w);
+            // (_, special, v@{CollapseExpr,...}) -> (CollapseExpr.define_stmt.final, epsilon, v)
+            for (long f: compiled[w].fsa.finals) {
+              long g = stmt2offset[w]+f;
+              sorted_emplace(adj[g], epsilon, v);
+              // an edge was just inserted into the list being scanned
+              if (g == i)
+                j++;
+            }
+          }
+      }
+      // remove (i, special, _)
+      adj[i].resize(j);
+    }
+  };
+  allocate(stmt);
+  anno.fsa.adj = move(adj);
+  anno.assoc = move(assoc);
+  anno.deterministic = false;
+  DP(3, "# of states: %ld", anno.fsa.n());
+
+  // substring grammar & this nonterminal is not marked as intact
+  if (opt_substring_grammar && ! stmt->intact) {
+    DP(3, "Constructing substring grammar");
+    anno.substring_grammar();
+    DP(3, "# of states: %ld", anno.fsa.n());
+  }
+
+  // map0[i] lists the old states merged into new state i.
+  vector<vector<long>> map0;
+  DP(3, "Determinize");
+  anno.determinize(&starts, &map0);
+  vector<bool> sub_final2(anno.fsa.n());
+  REP(i, anno.fsa.n())
+    for (long u: map0[i]) {
+      if (sub_final[u])
+        sub_final2[i] = true;
+      if (start2stmt.count(u)) {
+        DefineStmt* stmt = start2stmt[u];
+        // A negative value means this start was already assigned to a new
+        // state: new ids are stored as ~i to tell them from old ids.
+        if (stmt2start[stmt] < 0) {
+          stmt->module->locfile.error_context(stmt->loc, "the start has been included in multiple DFA states");
+          return false;
+        }
+        stmt2start[stmt] = ~ i;
+      }
+    }
+  sub_final = move(sub_final2);
+  start2stmt.clear();
+  // Undo the ~ encoding and rebuild the inverse map.
+  for (auto& it: stmt2start) {
+    it.second = ~ it.second;
+    start2stmt[it.second] = it.first;
+  }
+  DP(3, "# of states: %ld", anno.fsa.n());
+
+  DP(3, "Minimize");
+  map0.clear();
+  anno.minimize(&map0);
+  sub_final2.assign(anno.fsa.n(), false);
+  REP(i, anno.fsa.n())
+    for (long u: map0[i]) {
+      if (sub_final[u])
+        sub_final2[i] = true;
+      if (start2stmt.count(u)) {
+        DefineStmt* stmt = start2stmt[u];
+        stmt2start[stmt] = i;
+      }
+    }
+  sub_final = move(sub_final2);
+  start2stmt.clear();
+  for (auto& it: stmt2start)
+    start2stmt[it.second] = it.first;
+  DP(3, "# of states: %ld", anno.fsa.n());
+
+  if (! opt_keep_inaccessible) {
+    DP(3, "Keep accessible states");
+    // roots: start, starts of DefineStmt associated to CallExpr
+    starts.clear();
+    for (auto& it: stmt2start)
+      starts.push_back(it.second);
+    // map1[i] is the old id of new state i.
+    vector<long> map1;
+    anno.accessible(&starts, map1);
+    sub_final2.assign(anno.fsa.n(), false);
+    REP(i, anno.fsa.n()) {
+      long u = map1[i];
+      sub_final2[i] = sub_final[u];
+      if (start2stmt.count(u))
+        stmt2start[start2stmt[u]] = i;
+    }
+    sub_final = move(sub_final2);
+    start2stmt.clear();
+    for (auto& it: stmt2start)
+      start2stmt[it.second] = it.first;
+    DP(3, "# of states: %ld", anno.fsa.n());
+
+    DP(3, "Keep co-accessible states");
+    // roots: finals, finals of DefineStmt associated to CallExpr
+    map1.clear();
+    anno.co_accessible(&sub_final, map1);
+    sub_final2.assign(anno.fsa.n(), false);
+    REP(i, anno.fsa.n()) {
+      long u = map1[i];
+      sub_final2[i] = sub_final[u];
+      if (start2stmt.count(u))
+        stmt2start[start2stmt[u]] = i;
+    }
+    sub_final = move(sub_final2);
+    start2stmt.clear();
+    for (auto& it: stmt2start)
+      start2stmt[it.second] = it.first;
+    DP(3, "# of states: %ld", anno.fsa.n());
+  }
+
+  stmt2final[stmt] = sub_final;
+  auto& call_addr = stmt2call_addr[stmt];
+  call_addr.assign(anno.fsa.n(), make_pair(-1L, -1L));
+  DP(3, "CallExpr");
+  REP(i, anno.fsa.n())
+    if (anno.fsa.has_call(i)) {
+      // A calling state must have exactly one edge carrying a single label.
+      if (anno.fsa.adj[i].size() != 1 || anno.fsa.adj[i][0].first.second-anno.fsa.adj[i][0].first.first > 1) {
+        stmt->module->locfile.error_context(stmt->loc, "state %ld: CallExpr cannot coexist with other transitions", i);
+        for (auto it = anno.fsa.adj[i].begin(); it != anno.fsa.adj[i].end(); ) {
+          long from = it->first.first, to = it->first.second, v = it->second;
+          while (++it != anno.fsa.adj[i].end() && to == it->first.first && it->second == v)
+            to = it->first.second;
+          fprintf(stderr, "  (%ld,%ld)\n", from, to-1);
+        }
+        return false;
+      }
+      // (start of the callee, state reached once the call returns)
+      for (auto aa: anno.assoc[i])
+        if (has_start(aa.second))
+          if (auto* e = dynamic_cast<CallExpr*>(aa.first)) // unique
+            call_addr[i] = {stmt2start[e->define_stmt], anno.fsa.adj[i][0].second};
+    }
+
+  DP(3, "Removing action/CallExpr labels");
+  // Same back-to-front trimming as above, with action_label_base as the
+  // cut-off.
+  REP(i, anno.fsa.n()) {
+    long j = anno.fsa.adj[i].size();
+    while (j && action_label_base < anno.fsa.adj[i][j-1].first.second)
+      if (anno.fsa.adj[i][j-1].first.first < action_label_base)
+        anno.fsa.adj[i][j-1].first.second = action_label_base;
+      else
+        j--;
+    anno.fsa.adj[i].resize(j);
+  }
+
+  return true;
+}
+
+//// Graphviz dot renderer
+
+// Render every exported DefineStmt of `mo` as a Graphviz digraph on
+// `output`.
+//
+// Each exported statement produces one complete `digraph "<file>" { ... }`
+// block, opened and closed inside the loop, so the output stays well-formed
+// for any number of exports (including none). Final states are drawn as
+// filled double circles, the start state in orchid, and all labels leading
+// from a state to the same target are joined into one edge label.
+void generate_graphviz(Module* mo)
+{
+  fprintf(output, "// Generated by 偃师, %s\n", mo->filename.c_str());
+  for (Stmt* x = mo->toplevel; x; x = x->next) {
+    auto stmt = dynamic_cast<DefineStmt*>(x);
+    if (! stmt || ! stmt->export_)
+      continue;
+    FsaAnno& anno = compiled[stmt];
+
+    fprintf(output, "digraph \"%s\" {\n", mo->filename.c_str());
+    bool start_is_final = false;
+
+    // finals (the start state is emitted separately below)
+    indent(output, 1);
+    fprintf(output, "node[shape=doublecircle,color=olivedrab1,style=filled,fontname=Monospace];");
+    for (long f: anno.fsa.finals)
+      if (f == anno.fsa.start)
+        start_is_final = true;
+      else
+        fprintf(output, " %ld", f);
+    fprintf(output, "\n");
+
+    // start
+    indent(output, 1);
+    if (start_is_final)
+      fprintf(output, "node[shape=doublecircle,color=orchid];");
+    else
+      fprintf(output, "node[shape=circle,color=orchid];");
+    fprintf(output, " %ld\n", anno.fsa.start);
+
+    // other states
+    indent(output, 1);
+    fprintf(output, "node[shape=circle,color=black,style=\"\"]\n");
+
+    // edges: one arrow per (source, target), labels comma-separated
+    REP(u, anno.fsa.n()) {
+      unordered_map<long, stringstream> labels;
+      for (const auto& e: anno.fsa.adj[u]) {
+        stringstream& lb = labels[e.second];
+        if (! lb.str().empty())
+          lb << ',';
+        // label ranges are half-open
+        if (e.first.first == e.first.second-1)
+          lb << e.first.first;
+        else
+          lb << e.first.first << '-' << e.first.second-1;
+      }
+      for (auto& lb: labels) {
+        indent(output, 1);
+        fprintf(output, "%ld -> %ld[label=\"%s\"]\n", u, lb.first, lb.second.str().c_str());
+      }
+    }
+    fprintf(output, "}\n");
+  }
+}
+
+//// C++ renderer
+
+// Emit, into the generated function body, the set of states flagged in
+// `final` as a bitmap named "<name>final": bit (s % W) of word (s / W) is
+// set for every flagged state s, where W is the number of bits in a long.
+// A commented-out plain list "<name>finals" is emitted first for readers of
+// the generated code.
+// An empty `final` yields a single zero word, because an empty initializer
+// list is not valid in the generated C code.
+static void generate_final(const char* name, const vector<bool>& final)
+{
+  const long n = final.size();
+  const long bits = CHAR_BIT*sizeof(long);
+
+  // comment
+  fprintf(output, "  //static const long %sfinals[] = {", name);
+  bool first = true;
+  for (long i = 0; i < n; i++)
+    if (final[i]) {
+      if (first) first = false;
+      else fprintf(output, ",");
+      fprintf(output, "%ld", i);
+    }
+  fprintf(output, "};\n");
+
+  fprintf(output, "  static const unsigned long %sfinal[] = {", name);
+  for (long j = 0, i = 0; i < n; i += bits) {
+    // collect the flags of states [i, i+bits) into one word
+    unsigned long mask = 0;
+    for (; j < n && j < i+bits; j++)
+      if (final[j])
+        mask |= 1uL << (j-i);
+    if (i) fprintf(output, ",");
+    fprintf(output, "%#lx", mask);
+  }
+  if (n == 0)
+    fprintf(output, "0");
+  fprintf(output, "};\n");
+}
+
+// Emit everything generated for one exported statement:
+//  - the variable yanshi_<name>_start holding the start state,
+//  - yanshi_<name>_is_final(), which accepts when the current state is
+//    final and every state on the return stack is a sub-final state,
+//  - yanshi_<name>_transit() (via generate_transitions).
+// Declarations go to `output_header` when it is set.
+static void generate_cxx_export(DefineStmt* stmt)
+{
+  FsaAnno& anno = compiled[stmt];
+
+  // yanshi_%s_start
+  if (output_header)
+    fprintf(output_header, "extern long yanshi_%s_start;\n", stmt->lhs.c_str());
+  fprintf(output, "long yanshi_%s_start = %ld;\n\n", stmt->lhs.c_str(), anno.fsa.start);
+
+  // yanshi_%s_is_final
+  if (output_header) {
+    if (opt_gen_extern_c) fputs("extern \"C\" ", output_header);
+    fprintf(output_header, opt_gen_c ?
+"bool yanshi_%s_is_final(const long* ret_stack, long ret_stack_len, long u);\n"
+:
+"bool yanshi_%s_is_final(const vector<long>& ret_stack, long u);\n"
+, stmt->lhs.c_str());
+  }
+  if (opt_gen_extern_c) fputs("extern \"C\" ", output);
+  fprintf(output, opt_gen_c ?
+"bool yanshi_%s_is_final(const long* ret_stack, long ret_stack_len, long u)\n"
+:
+"bool yanshi_%s_is_final(const vector<long>& ret_stack, long u)\n"
+          , stmt->lhs.c_str());
+  fprintf(output, "{\n");
+  // bitmaps of final states and of sub-final states
+  vector<bool> final(anno.fsa.n());
+  for (long f: anno.fsa.finals)
+    final[f] = true;
+  generate_final("", final);
+  generate_final("sub_", stmt2final[stmt]);
+  // Walk the return stack from the top: the current state and every
+  // return state except the bottom one must be sub-final.
+  fprintf(output, opt_gen_c ?
+"  for (long i = ret_stack_len; i; u = ret_stack[--i])\n"
+:
+"  for (auto i = ret_stack.size(); i; u = ret_stack[--i])\n"
+);
+  // The function is closed with a bare "}": a trailing ';' at file scope is
+  // rejected by pedantic C compilers.
+  fprintf(output,
+"    if (! (0 <= u && u < %ld && sub_final[u/(CHAR_BIT*sizeof(long))] >> (u%%(CHAR_BIT*sizeof(long))) & 1))\n"
+"      return false;\n"
+"  return 0 <= u && u < %ld && final[u/(CHAR_BIT*sizeof(long))] >> (u%%(CHAR_BIT*sizeof(long))) & 1;\n"
+"}\n\n"
+, anno.fsa.n() , anno.fsa.n()
+);
+  generate_transitions(stmt);
+}
+
+// Emit the generated C/C++ source for module `mo` to `output` (and
+// declarations to `output_header` when set).
+//
+// Top-level statements are processed in order: each exported DefineStmt is
+// rendered with generate_cxx_export, each CppStmt is copied verbatim.
+// With opt_standalone, a main() driving the first exported statement is
+// appended: it takes the input from argv[1] (exactly one argument), from
+// stdin (no argument) or from the file argv[1] (more arguments), decodes it
+// as UTF-8 and prints the state after every code point.
+void generate_cxx(Module* mo)
+{
+  fprintf(output, "// Generated by 偃师, %s\n", mo->filename.c_str());
+  fprintf(output, "#include <limits.h>\n");
+  if (! opt_gen_c) {
+    fprintf(output, "#include <vector>\n");
+    fprintf(output, "using namespace std;\n");
+  } else {
+    fprintf(output, "#include <stdbool.h>\n");
+  }
+  if (opt_standalone) {
+    fputs(
+"#include <algorithm>\n"
+"#include <cinttypes>\n"
+"#include <clocale>\n"
+"#include <codecvt>\n"
+"#include <cstdint>\n"
+"#include <cstdio>\n"
+"#include <cstring>\n"
+"#include <cwctype>\n"
+"#include <iostream>\n"
+"#include <locale>\n"
+"#include <string>\n"
+"using namespace std;\n"
+, output);
+  }
+  if (output_header) {
+    fputs("#pragma once\n", output_header);
+    if (! opt_gen_c) {
+      fprintf(output_header, "#include <vector>\n");
+      fprintf(output_header, "using std::vector;\n");
+    } else {
+      fprintf(output_header, "#include <stdbool.h>\n");
+    }
+  }
+  fprintf(output, "\n");
+  // The first exported statement is the one driven by the standalone main.
+  DefineStmt* main_export = NULL;
+  for (Stmt* x = mo->toplevel; x; x = x->next)
+    if (auto xx = dynamic_cast<DefineStmt*>(x)) {
+      if (xx->export_) {
+        if (! main_export)
+          main_export = xx;
+        generate_cxx_export(xx);
+      }
+    } else if (auto xx = dynamic_cast<CppStmt*>(x))
+      fprintf(output, "%s", xx->code.c_str());
+  if (opt_standalone && main_export) {
+    fprintf(output,
+"\n"
+"int main(int argc, char* argv[])\n"
+"{\n"
+"  setlocale(LC_ALL, \"\");\n"
+"  string utf8;\n"
+"  const char* p;\n"
+"  long c, u = yanshi_%s_start, pref = 0;\n"
+, main_export->lhs.c_str());
+    if (opt_gen_c)
+      fprintf(output, "  long ret_stack[%ld], ret_stack_len = 0;\n", opt_max_return_stack);
+    else
+      fprintf(output, "  vector<long> ret_stack;\n");
+    // The generated reader bails out when the input file cannot be opened
+    // instead of passing a null FILE* to fgetc.
+    fprintf(output,
+"  if (argc == 2)\n"
+"    utf8 = argv[1];\n"
+"  else {\n"
+"    FILE* f = argc == 1 ? stdin : fopen(argv[1], \"r\");\n"
+"    if (! f) { perror(argv[1]); return 1; }\n"
+"    while ((c = fgetc(f)) != EOF)\n"
+"      utf8 += c;\n"
+"    fclose(f);\n"
+"  }\n"
+"  u32string utf32 = wstring_convert<codecvt_utf8<char32_t>, char32_t>{}.from_bytes(utf8);\n"
+);
+    // initial state, highlighted when it is accepting
+    fprintf(output, opt_gen_c ?
+"  printf(\"\\033[%%s33m%%ld \\033[m\", yanshi_%s_is_final(ret_stack, ret_stack_len, u) ? \"1;\" : \"\", u);\n"
+:
+"  printf(\"\\033[%%s33m%%ld \\033[m\", yanshi_%s_is_final(ret_stack, u) ? \"1;\" : \"\", u);\n"
+, main_export->lhs.c_str()
+);
+    fprintf(output,
+"  for (char32_t c: utf32) {\n");
+    fprintf(output, opt_gen_c ?
+"    u = yanshi_%s_transit(ret_stack, &ret_stack_len, u, c);\n"
+:
+"    u = yanshi_%s_transit(ret_stack, u, c);\n"
+, main_export->lhs.c_str());
+    fprintf(output,
+"    if (c > WCHAR_MAX || iswcntrl(c)) printf(\"%%\" PRIuLEAST32 \" \", c);\n"
+"    else cout << wstring_convert<codecvt_utf8<char32_t>, char32_t>{}.to_bytes(c) << ' ';\n");
+    fprintf(output, opt_gen_c ?
+"    printf(\"\\033[%%s33m%%ld \\033[m\", yanshi_%s_is_final(ret_stack, ret_stack_len, u) ? \"1;\" : \"\", u);\n"
+:
+"    printf(\"\\033[%%s33m%%ld \\033[m\", yanshi_%s_is_final(ret_stack, u) ? \"1;\" : \"\", u);\n"
+, main_export->lhs.c_str());
+    fprintf(output,
+"    if (u < 0) break;\n"
+"    pref++;\n"
+"  }\n");
+    // summary line; utf32.size() is unsigned, so it is printed with %zu
+    fprintf(output, opt_gen_c ?
+"  printf(\"\\nlen: %%zu\\npref: %%ld\\nstate: %%ld\\nfinal: %%s\\n\", utf32.size(), pref, u, yanshi_%s_is_final(ret_stack, ret_stack_len, u) ? \"true\" : \"false\");\n"
+"}\n"
+:
+"  printf(\"\\nlen: %%zu\\npref: %%ld\\nstate: %%ld\\nfinal: %%s\\n\", utf32.size(), pref, u, yanshi_%s_is_final(ret_stack, u) ? \"true\" : \"false\");\n"
+"}\n"
+, main_export->lhs.c_str());
+  }
+}

+ 14 - 0
yanshi/src/compiler.hh

@@ -0,0 +1,14 @@
+// Interface of the compiler: building automata from definitions and
+// rendering them as C/C++ source or Graphviz.
+#pragma once
+#include "fsa_anno.hh"
+#include "syntax.hh"
+
+#include <unordered_map>
+using std::unordered_map;
+
+// Debug dumps to stdout.
+void print_assoc(const FsaAnno& anno);
+void print_automaton(const Fsa& fsa);
+// Build and cache the automaton of a definition (no-op when already cached).
+void compile(DefineStmt*);
+// Prepare an exported definition for code generation; false on error.
+bool compile_export(DefineStmt* stmt);
+// Renderers for the exported definitions of a module.
+void generate_cxx(Module* mo);
+void generate_graphviz(Module* mo);
+// Cache of compiled automata, keyed by definition.
+extern unordered_map<DefineStmt*, FsaAnno> compiled;

+ 571 - 0
yanshi/src/fsa.cc

@@ -0,0 +1,571 @@
+#include "common.hh"
+#include "fsa.hh"
+#include "option.hh"
+
+#include <algorithm>
+#include <assert.h>
+#include <limits.h>
+#include <queue>
+#include <set>
+#include <stack>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+using namespace std;
+
+namespace std
+{
+  // Hash for vector<T>: folds the element hashes left to right as
+  // acc = acc*17 + hash(element), starting from 0 (an empty vector hashes to 0).
+  // Needed so that vector<long> can key an unordered_map (see Fsa::determinize).
+  template<typename T>
+  struct hash<vector<T>> {
+    size_t operator()(const vector<T>& v) const {
+      hash<T> element_hash;
+      size_t acc = 0;
+      for (auto it = v.begin(); it != v.end(); ++it) {
+        acc *= 17;
+        acc += element_hash(*it);
+      }
+      return acc;
+    }
+  };
+}
+
+// Debug invariant check: within every adjacency list, two neighbouring edges
+// must either both have a label ending at 0 (epsilon edges end at 0) or be
+// ordered so that the earlier label ends no later than the next one begins.
+void Fsa::check() const
+{
+  REP(i, n()) {
+    const vector<Edge>& edges = adj[i];
+    FOR(j, 1, edges.size()) {
+      const Label &before = edges[j-1].first, &after = edges[j].first;
+      bool both_end_at_zero = before.second == 0 && after.second == 0;
+      assert(both_end_at_zero || before.second <= after.first);
+    }
+  }
+}
+
+// Returns true iff state u has an outgoing edge whose half-open label range
+// [first, second) contains c. Relies on adj[u] being sorted.
+bool Fsa::has(long u, long c) const
+{
+  const vector<Edge>& edges = adj[u];
+  // First edge whose range starts strictly after c; the candidate is the one before it.
+  auto it = upper_bound(ALL(edges), make_pair(make_pair(c, LONG_MAX), LONG_MAX));
+  if (it == edges.begin())
+    return false;
+  --it;
+  return c < it->first.second;
+}
+
+// Returns true iff state u has an outgoing edge whose label range overlaps
+// [call_label_base, call_label). Relies on adj[u] being sorted.
+bool Fsa::has_call(long u) const
+{
+  const vector<Edge>& edges = adj[u];
+  auto it = upper_bound(ALL(edges), make_pair(make_pair(call_label_base, LONG_MAX), LONG_MAX));
+  // An edge starting after call_label_base but before call_label.
+  if (it != edges.end() && it->first.first < call_label)
+    return true;
+  // Otherwise only the preceding edge can still reach past call_label_base.
+  if (it == edges.begin())
+    return false;
+  --it;
+  return call_label_base < it->first.second;
+}
+
+// Returns true iff state u has an outgoing edge whose label range reaches
+// past call_label_base (no upper bound is checked). Relies on adj[u] being sorted.
+bool Fsa::has_call_or_collapse(long u) const
+{
+  const vector<Edge>& edges = adj[u];
+  auto it = upper_bound(ALL(edges), make_pair(make_pair(call_label_base, LONG_MAX), LONG_MAX));
+  // Any edge starting after call_label_base qualifies.
+  if (it != edges.end())
+    return true;
+  if (it == edges.begin())
+    return false;
+  --it;
+  return call_label_base < it->first.second;
+}
+
+// Follows the edge of state u whose half-open label range contains c and
+// returns its target state, or -1 when there is no such edge.
+// Relies on adj[u] being sorted.
+long Fsa::transit(long u, long c) const
+{
+  const vector<Edge>& edges = adj[u];
+  auto it = upper_bound(ALL(edges), make_pair(make_pair(c, LONG_MAX), LONG_MAX));
+  if (it == edges.begin())
+    return -1;
+  --it;
+  if (c < it->first.second)
+    return it->second;
+  return -1;
+}
+
+// Returns true iff x is listed in `finals`, using a binary search
+// (which requires `finals` to be sorted).
+bool Fsa::is_final(long x) const
+{
+  auto pos = lower_bound(ALL(finals), x);
+  return pos != finals.end() && *pos == x;
+}
+
+// Extends `src` in place with every state reachable from it through edges whose
+// label starts below 0 (epsilon is {-1, 0}), then sorts `src`.
+// The scan of each adjacency list stops at the first edge whose label starts
+// at 0 or above, so it relies on such edges being sorted last.
+// The scratch bitmap is a function-level static that is reset before returning.
+void Fsa::epsilon_closure(vector<long>& src) const
+{
+  static vector<bool> seen;
+  if (seen.size() < n())
+    seen.resize(n());
+  for (long s: src)
+    seen[s] = true;
+  // `src` doubles as the work queue: newly found states are appended while scanning.
+  REP(head, src.size()) {
+    for (auto& e: adj[src[head]]) {
+      if (e.first.first > -1) break;
+      if (seen[e.second]) continue;
+      seen[e.second] = true;
+      src.push_back(e.second);
+    }
+  }
+  for (long s: src)
+    seen[s] = false;
+  sort(ALL(src));
+}
+
+// Complement: returns a copy with one extra state (index n()) that loops to
+// itself on [0, AB). Every gap in a state's label coverage below AB is filled
+// with an edge to that extra state, and the set of final states is inverted
+// (the extra state becomes final).
+Fsa Fsa::operator~() const
+{
+  long sink = n();
+  Fsa r;
+  r.start = start;
+  r.adj.resize(sink+1);
+  REP(i, sink) {
+    auto& out = r.adj[i];
+    long covered = 0; // labels below `covered` are already handled
+    for (auto& e: adj[i]) {
+      if (covered < e.first.first)
+        out.emplace_back(make_pair(covered, e.first.first), sink);
+      out.emplace_back(e.first, e.second);
+      covered = e.first.second;
+    }
+    if (covered < AB)
+      out.emplace_back(make_pair(covered, AB), sink);
+  }
+  r.adj[sink].emplace_back(make_pair(0, AB), sink);
+  // Walk the sorted `finals` alongside the state indices and keep the non-members.
+  auto f = finals.begin();
+  REP(i, sink+1) {
+    while (f != finals.end() && *f < i)
+      ++f;
+    if (f == finals.end() || *f != i)
+      r.finals.push_back(i);
+  }
+  return r;
+}
+
+void Fsa::accessible(const vector<long>* starts, function<void(long)> relate)
+{ /* Prunes, in place, every state that cannot be reached from `start` (or
+   * from any state listed in *starts, when `starts` is non-null) by following
+   * edges of any label. Surviving states are renumbered densely in increasing
+   * order of their old index; relate(old_index) is invoked once per survivor,
+   * in that order. `start`, `finals` and all edge targets are rewritten.
+   */
+  vector<long> q{start}, id(n(), 0); // q: work queue; id: visited flag for now
+  id[start] = 1;
+  if (starts)
+    for (long u: *starts)
+      if (! id[u]) {
+        id[u] = 1;
+        q.push_back(u);
+      }
+  REP(i, q.size()) { // states are appended to q inside this loop
+    long u = q[i];
+    for (auto& e: adj[u]) {
+      //if (e.first.first >= AB) break;
+      if (! id[e.second]) {
+        id[e.second] = 1;
+        q.push_back(e.second);
+      }
+    }
+  }
+
+  long j = 0;
+  REP(i, n())
+    id[i] = id[i] ? j++ : -1; // id[i] becomes the new index, or -1 for a dropped state
+
+  auto it = finals.begin(), it2 = it; // it: read cursor, it2: write cursor over finals
+  REP(i, n())
+    if (id[i] >= 0) {
+      relate(i);
+      if (start == i)
+        start = id[i];
+      while (it != finals.end() && *it < i)
+        ++it;
+      if (it != finals.end() && *it == i)
+        *it2++ = id[i];
+      long k = 0;
+      for (auto& e: adj[i]) // keep only edges to surviving states, compacting in place
+        if (id[e.second] >= 0)
+          adj[i][k++] = {e.first, id[e.second]}; // unordered unless deterministic
+      adj[i].resize(k);
+      if (id[i] != i)
+        adj[id[i]] = move(adj[i]); // id[i] <= i, so the destination slot was already processed
+    }
+  finals.erase(it2, finals.end());
+  adj.resize(j);
+}
+
+void Fsa::co_accessible(const vector<bool>*final, function<void(long)> relate)
+{ /* Prunes, in place, every state from which no seed state can be reached.
+   * Seeds are the states in `finals` plus, when `final` is non-null, every
+   * state i with (*final)[i] set. If `start` itself cannot reach a seed the
+   * automaton is reset to a single state 0 with no edges and no finals, and
+   * relate is never called. Otherwise survivors are renumbered densely in
+   * increasing order of their old index and relate(old_index) is invoked once
+   * per survivor, in that order.
+   */
+  vector<vector<long>> radj(n()); // reverse adjacency: radj[v] lists predecessors of v
+  REP(i, n())
+    for (auto& e: adj[i]) {
+      //if (e.first.first >= AB) break;
+      radj[e.second].push_back(i);
+    }
+  REP(i, n())
+    sort(ALL(radj[i]));
+  vector<long> q = finals, id(n(), 0); // q: work queue seeded with finals; id: visited flag for now
+  for (long f: finals)
+    id[f] = 1;
+  if (final)
+    REP(i, n())
+      if ((*final)[i] && ! id[i]) {
+        id[i] = 1;
+        q.push_back(i);
+      }
+  REP(i, q.size()) { // backward search; states are appended to q inside this loop
+    long u = q[i];
+    for (auto& v: radj[u])
+      if (! id[v]) {
+        id[v] = 1;
+        q.push_back(v);
+      }
+  }
+  if (! id[start]) { // nothing useful is reachable: collapse to one edge-less state
+    start = 0;
+    finals.clear();
+    adj.assign(1, {});
+    return;
+  }
+
+  long j = 0;
+  REP(i, n())
+    id[i] = id[i] ? j++ : -1; // id[i] becomes the new index, or -1 for a dropped state
+
+  auto it = finals.begin(), it2 = it; // it: read cursor, it2: write cursor over finals
+  REP(i, n())
+    if (id[i] >= 0) {
+      relate(i);
+      if (start == i)
+        start = id[i];
+      while (it != finals.end() && *it < i)
+        ++it;
+      if (it != finals.end() && *it == i)
+        *it2++ = id[i];
+      long k = 0;
+      for (auto& e: adj[i]) // keep only edges to surviving states, compacting in place
+        if (id[e.second] >= 0)
+          adj[i][k++] = {e.first, id[e.second]}; // unordered unless deterministic
+      adj[i].resize(k);
+      if (id[i] != i)
+        adj[id[i]] = move(adj[i]); // id[i] <= i, so the destination slot was already processed
+    }
+  finals.erase(it2, finals.end());
+  adj.resize(j);
+}
+
+// Product construction for "this minus rhs" (both operands are expected to be
+// deterministic, see fsa.hh). Result states are pairs (u0, u1) numbered in
+// discovery order; u1 == rhs.n() stands for "rhs has no matching transition"
+// (a dead state without outgoing edges). A pair is final when u0 is final in
+// this automaton and u1 is not final in rhs. relate(u0) is invoked once per
+// result state, in index order.
+Fsa Fsa::difference(const Fsa& rhs, function<void(long)> relate) const
+{
+  // Edge list of the rhs dead state. Using a dedicated empty list avoids
+  // touching rhs.adj[0], which does not exist when rhs has no states.
+  static const vector<Edge> no_edges;
+  Fsa r;
+  vector<pair<long, long>> q;
+  unordered_map<long, long> m; // (rhs.n()+1)*u0 + u1 -> result state index
+  q.emplace_back(start, rhs.start);
+  m[(rhs.n()+1) * start + rhs.start] = 0;
+  r.start = 0;
+  REP(i, q.size()) {
+    long u0, u1;
+    tie(u0, u1) = q[i];
+    if (is_final(u0) && ! rhs.is_final(u1))
+      r.finals.push_back(i);
+    r.adj.emplace_back();
+    relate(u0);
+    const vector<Edge>& es1 = u1 == rhs.n() ? no_edges : rhs.adj[u1];
+    auto it0 = adj[u0].begin();
+    auto it1 = es1.begin(), it1e = es1.end();
+    // Cut every lhs range into pieces on which the rhs transition is constant.
+    long last = LONG_MIN;
+    while (it0 != adj[u0].end()) {
+      long from = max(last, it0->first.first), to = it0->first.second;
+      while (it1 != it1e && it1->first.second <= from)
+        ++it1;
+      if (it1 != it1e)
+        to = min(to, from < it1->first.first ? it1->first.first : it1->first.second);
+      last = to;
+      // rhs target on [from, to): the matching edge's target, or the dead state.
+      long v1 = it1 != it1e && it1->first.first <= from ? it1->second : rhs.n(),
+           t = (rhs.n()+1) * it0->second + v1;
+      auto mit = m.find(t);
+      if (mit == m.end()) {
+        mit = m.emplace(t, m.size()).first;
+        q.emplace_back(it0->second, v1);
+      }
+      r.adj[i].emplace_back(make_pair(from, to), mit->second);
+      if (to == it0->first.second)
+        ++it0;
+    }
+  }
+  return r;
+}
+
+// Product construction for the intersection (both operands are expected to be
+// deterministic, see fsa.hh). Result states are pairs (u0, u1) numbered in
+// discovery order; a pair is final when both components are final.
+// relate(u0, u1) is invoked once per result state, in index order.
+Fsa Fsa::intersect(const Fsa& rhs, function<void(long, long)> relate) const
+{
+  Fsa r;
+  r.start = 0;
+  vector<pair<long, long>> q;
+  unordered_map<long, long> ids; // rhs.n()*u0 + u1 -> result state index
+  q.emplace_back(start, rhs.start);
+  ids[rhs.n() * start + rhs.start] = 0;
+  REP(i, q.size()) {
+    // Copy the pair: q may reallocate while new pairs are appended below.
+    long u0 = q[i].first, u1 = q[i].second;
+    if (is_final(u0) && rhs.is_final(u1))
+      r.finals.push_back(i);
+    r.adj.emplace_back();
+    relate(u0, u1);
+    auto a = adj[u0].begin(), a_end = adj[u0].end();
+    auto b = rhs.adj[u1].begin(), b_end = rhs.adj[u1].end();
+    // Merge the two sorted edge lists, emitting the overlap of each pair of ranges.
+    while (a != a_end && b != b_end) {
+      if (a->first.second <= b->first.first) {
+        ++a;
+        continue;
+      }
+      if (b->first.second <= a->first.first) {
+        ++b;
+        continue;
+      }
+      long key = rhs.n() * a->second + b->second;
+      auto found = ids.find(key);
+      if (found == ids.end()) {
+        found = ids.emplace(key, ids.size()).first;
+        q.emplace_back(a->second, b->second);
+      }
+      long lo = max(a->first.first, b->first.first),
+           hi = min(a->first.second, b->first.second);
+      r.adj[i].emplace_back(make_pair(lo, hi), found->second);
+      // Advance whichever range ends first (both when they end together).
+      long a_hi = a->first.second, b_hi = b->first.second;
+      if (a_hi <= b_hi)
+        ++a;
+      if (b_hi <= a_hi)
+        ++b;
+    }
+  }
+  return r;
+}
+
+Fsa Fsa::determinize(const vector<long>* starts, function<void(long, const vector<long>&)> relate) const
+{ /* Subset construction. Each result state stands for an epsilon-closed,
+   * sorted set of source states; result state 0 is the closure of `start`.
+   * When `starts` is non-null, the closure of each listed state is seeded as
+   * an additional result state. relate(id, subset) is invoked once for every
+   * result state when it is taken off the work stack. The returned `finals`
+   * is sorted; a result state is final when its subset holds a final state.
+   */
+  Fsa r;
+  r.start = 0;
+  unordered_map<vector<long>, long> m; // subset -> result state index
+  vector<vector<Edge>::const_iterator> its(n()); // NOTE(review): not referenced again in this function
+  vector<long> vs{start};
+  vector<pair<long, long>> events;
+  stack<vector<long>> st; // subsets still to be expanded
+  epsilon_closure(vs);
+  m[vs] = 0;
+  st.push(move(vs));
+  if (starts)
+    for (long u: *starts) {
+      vs.assign(1, u);
+      epsilon_closure(vs);
+      if (! m.count(vs)) {
+        m.emplace(vs, m.size());
+        st.push(move(vs));
+      }
+    }
+  while (st.size()) {
+    vector<long> x = move(st.top());
+    st.pop();
+    long id = m[x];
+    if (id+1 > r.adj.size())
+      r.adj.resize(id+1);
+    relate(id, x);
+    bool final = false;
+    events.clear();
+    for (long u: x) {
+      if (is_final(u))
+        final = true;
+      for (auto& e: adj[u]) {
+        events.emplace_back(e.first.first, e.second); // range opens: target becomes live
+        events.emplace_back(e.first.second, ~ e.second); // range closes: encoded as ~target (negative)
+      }
+    }
+    if (final)
+      r.finals.push_back(id);
+    long last = 0; // sweep position; segments are only emitted for positions above 0
+    multiset<long> live; // targets of the ranges covering the current sweep position
+    sort(ALL(events)); // at equal positions the negative close events sort before open events
+    for (auto& ev: events) {
+      if (last < ev.first) {
+        if (live.size()) {
+          vs.assign(ALL(live));
+          vs.erase(unique(ALL(vs)), vs.end());
+          epsilon_closure(vs);
+          auto mit = m.find(vs);
+          if (mit == m.end()) {
+            mit = m.emplace(vs, m.size()).first;
+            st.push(vs);
+          }
+          if (r.adj[id].size() && r.adj[id].back().first.second == last && r.adj[id].back().second == mit->second) // coalesce two edges
+            r.adj[id].back().first.second = ev.first;
+          else
+            r.adj[id].emplace_back(make_pair(last, ev.first), mit->second);
+        }
+        last = ev.first;
+      }
+      if (ev.second >= 0)
+        live.insert(ev.second);
+      else
+        live.erase(live.find(~ ev.second)); // remove a single occurrence only
+    }
+  }
+  sort(ALL(r.finals));
+  return r;
+}
+
+Fsa Fsa::distinguish(function<void(vector<long>&)> relate) const
+{ /* Merges indistinguishable states by partition refinement (the structure
+   * resembles Hopcroft's DFA minimization) and returns the quotient automaton.
+   * The partition starts as {final states, non-final states} and is split
+   * until stable. relate(members) is invoked once per resulting block, in
+   * result-state order, with the source states merged into it.
+   *
+   * Bookkeeping arrays, all indexed by source state:
+   *   L, R  - previous / next state in the circular list of the state's block
+   *   B     - representative (first member) of the state's block
+   *   C     - block size, stored at the representative
+   *   CC    - per-step count of marked members, stored at the representative
+   *   mark  - state has a transition on the current label into the splitter
+   */
+  vector<long> scale; // every label endpoint; adjacent entries delimit elementary intervals
+  REP(i, n())
+    for (auto& e: adj[i]) {
+      scale.push_back(e.first.first);
+      scale.push_back(e.first.second);
+    }
+  sort(ALL(scale));
+  scale.erase(unique(ALL(scale)), scale.end());
+
+  vector<vector<pair<long, long>>> radj(n()); // radj[v]: sorted (elementary interval index, predecessor)
+  REP(i, n())
+    for (auto& e: adj[i]) {
+      long from = lower_bound(ALL(scale), e.first.first) - scale.begin(),
+           to = lower_bound(ALL(scale), e.first.second) - scale.begin();
+      FOR(j, from, to)
+        radj[e.second].emplace_back(j, i);
+    }
+  REP(i, n())
+    sort(ALL(radj[i]));
+  vector<long> L(n()), R(n()), B(n()), C(n(), 0), CC(n(), 0);
+  vector<bool> mark(n(), false);
+
+  // distinguish finals & non-finals
+  long fx = -1, x = -1, fy = -1, y = -1, j = 0; // fx/x: first/last non-final seen; fy/y: first/last final seen
+  REP(i, n())
+    if (j < finals.size() && finals[j] == i) {
+      j++;
+      if (y < 0)
+        fy = i;
+      else
+        R[y] = i;
+      C[B[i] = fy]++;
+      L[i] = y;
+      y = i;
+    } else {
+      if (x < 0)
+        fx = i;
+      else
+        R[x] = i;
+      C[B[i] = fx]++;
+      L[i] = x;
+      x = i;
+    }
+  if (x >= 0)
+    L[fx] = x, R[x] = fx; // close the circular list of non-finals
+  if (y >= 0)
+    L[fy] = y, R[y] = fy; // close the circular list of finals
+
+  set<pair<long, long>> refines; // pending (label index, splitter block representative) pairs
+  auto labels = [&](long fx) { // sorted unique label indices entering the block led by fx
+    vector<long> lb;
+    for (long x = fx; ; ) {
+      for (auto& e: radj[x])
+        lb.push_back(e.first);
+      if ((x = R[x]) == fx) break;
+    }
+    sort(ALL(lb));
+    lb.erase(unique(ALL(lb)), lb.end());
+    return lb;
+  };
+
+  if (fx >= 0)
+    for (long a: labels(fx))
+      refines.emplace(a, fx);
+  if (fy >= 0)
+    for (long a: labels(fy))
+      refines.emplace(a, fy);
+  while (refines.size()) {
+    long a;
+    tie(a, fx) = *refines.begin();
+    refines.erase(refines.begin());
+    // count
+    vector<long> bs; // representatives of blocks holding at least one marked state
+    for (x = fx; ; ) {
+      auto it = lower_bound(ALL(radj[x]), make_pair(a, 0L)),
+           ite = upper_bound(ALL(radj[x]), make_pair(a, n()));
+      for (; it != ite; ++it) {
+        y = it->second;
+        if (! CC[B[y]]++)
+          bs.push_back(B[y]);
+        mark[y] = true;
+      }
+      if ((x = R[x]) == fx) break;
+    }
+    // for each refinable set
+    for (long fy: bs) {
+      if (CC[fy] < C[fy]) { // only some members are marked: split into marked (u) and unmarked (v)
+        long fu = -1, u = -1, cu = 0,
+             fv = -1, v = -1, cv = 0;
+        vector<long> lb = labels(fy); // collected before the lists are relinked
+        for (long i = fy; ; ) {
+          if (mark[i]) {
+            mark[i] = false;
+            if (u < 0)
+              C[fu = i] = 0;
+            else
+              R[u] = i;
+            C[fu]++;
+            B[i] = fu;
+            L[i] = u;
+            u = i;
+          } else {
+            if (v < 0)
+              C[fv = i] = 0;
+            else
+              R[v] = i;
+            C[fv]++;
+            B[i] = fv;
+            L[i] = v;
+            v = i;
+          }
+          if ((i = R[i]) == fy) break;
+        }
+        L[fu] = u, R[u] = fu;
+        L[fv] = v, R[v] = fv;
+        //REP(a, AB+1)
+        for (long a: lb)
+          if (refines.count({a, fy})) // (a, fy) still pending: also queue the half that lost the old representative
+            refines.emplace(a, fu != fy ? fu : fv);
+          else // otherwise queue whichever half compares smaller (fv on a tie)
+            refines.emplace(a, C[fu] < C[fv] ? fu : fv);
+      } else // every member is marked: no split, just clear the marks
+        for (long i = fy; ; ) {
+          mark[i] = false;
+          if ((i = R[i]) == fy) break;
+        }
+      CC[fy] = 0;
+    }
+    // clear marks
+    for (x = fx; ; ) {
+      auto it = lower_bound(ALL(radj[x]), make_pair(a, 0L)),
+           ite = upper_bound(ALL(radj[x]), make_pair(a, n()));
+      for (; it != ite; ++it) {
+        y = it->second;
+        CC[B[y]] = 0;
+        mark[y] = false;
+      }
+      if ((x = R[x]) == fx) break;
+    }
+  }
+
+  Fsa r;
+  long nn = 0; // number of result states assigned so far
+  vector<long> vs;
+  REP(i, n())
+    if (B[i] == i) { // i is a block representative
+      vs.clear();
+      for (long j = i; ; ) {
+        B[j] = nn; // from here on B maps a source state to its result state
+        vs.push_back(j);
+        if ((j = R[j]) == i) break;
+      }
+      relate(vs);
+      if (binary_search(ALL(finals), i))
+        r.finals.push_back(nn);
+      nn++;
+    }
+  r.start = B[start];
+  r.adj.resize(nn);
+  REP(i, n())
+    for (auto& e: adj[i])
+      r.adj[B[i]].emplace_back(e.first, B[e.second]);
+  REP(i, nn) {
+    // merge edges with the same destination
+    sort(ALL(r.adj[i]), [](const Edge& x, const Edge& y) {
+      return x.second != y.second ? x.second < y.second : x.first < y.first;
+    });
+    auto it2 = r.adj[i].begin(); // write cursor
+    for (auto it = r.adj[i].begin(); it != r.adj[i].end(); ) {
+      long v = it->second, from = it->first.first, to = it->first.second;
+      while (++it != r.adj[i].end() && it->second == v)
+        if (it->first.first <= to) // overlapping or touching: extend the current range
+          to = max(to, it->first.second);
+        else {
+          *it2++ = make_pair(make_pair(from, to), v);
+          tie(from, to) = it->first;
+        }
+      *it2++ = make_pair(make_pair(from, to), v);
+    }
+    r.adj[i].erase(it2, r.adj[i].end());
+    sort(ALL(r.adj[i])); // restore ordering by label
+  }
+  return r;
+}

+ 40 - 0
yanshi/src/fsa.hh

@@ -0,0 +1,40 @@
+#pragma once
+#include <functional>
+#include <utility>
+#include <vector>
+using std::function;
+using std::pair;
+using std::vector;
+
+typedef pair<long, long> Label; // half-open label range [first, second); a single symbol c is {c, c+1}
+typedef pair<Label, long> Edge; // (label range, target state)
+
+const Label epsilon{-1L, 0L}; // label of epsilon edges; it starts below 0, so such edges sort before symbol edges
+
+struct Fsa { // finite state automaton; states are the indices 0 .. n()-1 into adj
+  long start; // index of the start state
+  vector<long> finals; // sorted
+  vector<vector<Edge>> adj; // sorted
+
+  void check() const; // asserts the ordering invariant of every adjacency list
+  long n() const { return adj.size(); } // number of states
+  bool is_final(long x) const; // binary search in finals
+  bool has(long u, long c) const; // does state u have an edge whose range contains c?
+  bool has_call(long u) const; // does u have an edge overlapping [call_label_base, call_label)?
+  bool has_call_or_collapse(long u) const; // does u have an edge reaching past call_label_base?
+  long transit(long u, long c) const; // target of u's edge containing c, or -1
+  void epsilon_closure(vector<long>& src) const; // extends src in place and sorts it
+  Fsa operator~() const; // complement over [0, AB); adds one extra state
+  // a -> a (in place): drops states unreachable from start / *starts; relate(old index) per kept state
+  void accessible(const vector<long>* starts, function<void(long)> relate);
+  // a -> a (in place): drops states that cannot reach finals / flagged states; relate(old index) per kept state
+  void co_accessible(const vector<bool>* final, function<void(long)> relate);
+  // DFA -> DFA -> DFA; relate(lhs state, rhs state) per result state
+  Fsa intersect(const Fsa& rhs, function<void(long, long)> relate) const;
+  // DFA -> DFA -> DFA; relate(lhs state) per result state
+  Fsa difference(const Fsa& rhs, function<void(long)> relate) const;
+  // DFA -> DFA; merges indistinguishable states, relate(members) per result state
+  Fsa distinguish(function<void(vector<long>&)> relate) const;
+  // * -> DFA; subset construction, relate(result state, source subset) per result state
+  Fsa determinize(const vector<long>* starts, function<void(long, const vector<long>&)> relate) const;
+};

+ 476 - 0
yanshi/src/fsa_anno.cc

@@ -0,0 +1,476 @@
+#include "common.hh"
+#include "compiler.hh"
+#include "fsa_anno.hh"
+#include "loader.hh"
+#include "option.hh"
+
+#include <algorithm>
+#include <limits.h>
+#include <map>
+#include <unicode/utf8.h>
+#include <utility>
+using namespace std;
+
+// Orders tags by their underlying integer value.
+bool operator<(ExprTag x, ExprTag y)
+{
+  const long lhs = static_cast<long>(x), rhs = static_cast<long>(y);
+  return lhs < rhs;
+}
+
+// Returns true iff the association list `as` holds an entry for expression x.
+// `as` must be sorted: the lookup is a binary search for (x, ExprTag(0)),
+// a key that cannot sort after any real entry for x.
+// The parameter is `const Expr*` to match the declaration in fsa_anno.hh;
+// with `Expr*` this definition was a separate overload and the declared
+// function had no definition.
+bool assoc_has_expr(vector<pair<Expr*, ExprTag>>& as, const Expr* x)
+{
+  // The cast only builds a search key of the element type; x is never written through.
+  auto it = lower_bound(ALL(as), make_pair(const_cast<Expr*>(x), ExprTag(0)));
+  return it != as.end() && it->first == x;
+}
+
+// Sorts `as` and collapses entries that refer to the same expression into a
+// single entry whose tag is the bitwise OR of the merged tags.
+void sort_assoc(vector<pair<Expr*, ExprTag>>& as)
+{
+  sort(ALL(as));
+  auto out = as.begin(); // write cursor
+  for (auto head = as.begin(); head != as.end(); ) {
+    // [head, tail) is the run of entries sharing head's expression.
+    auto tail = head;
+    long tags = long(head->second);
+    for (++tail; tail != as.end() && tail->first == head->first; ++tail)
+      tags |= long(tail->second);
+    *out++ = make_pair(head->first, ExprTag(tags));
+    head = tail;
+  }
+  as.erase(out, as.end());
+}
+
+void FsaAnno::add_assoc(Expr& expr)
+{ /* Tags every state of `fsa` with `expr`: ExprTag::start for the start state,
+   * ExprTag::final for final states (both bits when a state is both), and
+   * ExprTag::inner for all others. Afterwards pseudo self-loop edges with
+   * fresh action labels are appended, as described further below.
+   */
+  // has actions: actions need tags to differentiate 'entering', 'leaving', ...
+  // 'intact': states with the 'inner' tag cannot be connected to start/final in substring grammar
+  // 'CallExpr' 'CollapseExpr': differentiate states representing 'CallExpr' 'CollapseExpr' (u, special, v)
+  // 'opt_mode': displaying possible positions for given strings in interactive mode
+  if (expr.no_action() && ! expr.stmt->intact && ! dynamic_cast<CallExpr*>(&expr) && ! dynamic_cast<CollapseExpr*>(&expr) && opt_mode != Mode::interactive)
+    return; // none of the reasons above applies: nothing to record
+  auto j = fsa.finals.begin(); // cursor into the sorted finals, advanced alongside i
+  REP(i, fsa.n()) {
+    ExprTag tag = ExprTag(0);
+    if (i == fsa.start)
+      tag = ExprTag::start;
+    while (j != fsa.finals.end() && *j < i)
+      ++j;
+    if (j != fsa.finals.end() && *j == i)
+      tag = ExprTag(long(tag) | long(ExprTag::final));
+    if (tag == ExprTag(0))
+      tag = ExprTag::inner;
+    sorted_insert(assoc[i], make_pair(&expr, tag));
+  }
+  // Add pseudo transitions with labels [ACTION_LABEL_BASE, COLLAPSE_LABEL_BASE) to prevent its merge with other states
+  if (expr.leaving.size() || expr.entering.size() || expr.transiting.size())
+    for (auto action: expr.transiting) // NOTE(review): iterates `transiting` only, so nothing is added when just entering/leaving actions exist -- confirm intent
+      REP(i, fsa.n()) {
+        fsa.adj[i].emplace_back(make_pair(action_label, action_label+1), i); // self-loop with a unique label
+        action_label++;
+      }
+  else if (expr.finishing.size())
+    for (long f: fsa.finals) {
+      fsa.adj[f].emplace_back(make_pair(action_label, action_label+1), f); // self-loop with a unique label
+      action_label++;
+    }
+}
+
+// Prunes states unreachable from the start state (or from *starts) and keeps
+// `assoc` aligned with the renumbered automaton. The old index of every kept
+// state is appended to `mapping`, in new-index order.
+void FsaAnno::accessible(const vector<long>* starts, vector<long>& mapping) {
+  long kept = 0;
+  fsa.accessible(starts, [&](long old_id) {
+    // Slide the association list down to the state's new index.
+    if (kept != old_id)
+      assoc[kept] = move(assoc[old_id]);
+    ++kept;
+    mapping.push_back(old_id);
+  });
+  assoc.resize(kept);
+}
+
+// Prunes states that cannot reach a final (or flagged) state and keeps `assoc`
+// aligned with the renumbered automaton. The old index of every kept state is
+// appended to `mapping`, in new-index order. When no final state is left, the
+// annotation is reset to a single untagged state and mapping becomes {0}.
+void FsaAnno::co_accessible(const vector<bool>* final, vector<long>& mapping) {
+  long kept = 0;
+  fsa.co_accessible(final, [&](long old_id) {
+    if (kept != old_id)
+      assoc[kept] = move(assoc[old_id]);
+    ++kept;
+    mapping.push_back(old_id);
+  });
+  if (fsa.finals.empty()) { // 'start' does not produce acceptable strings
+    assoc.assign(1, {});
+    mapping.assign(1, 0);
+    deterministic = true;
+    return;
+  }
+  // Edge lists are left unordered by the pruning unless the automaton is deterministic.
+  if (! deterministic)
+    for (auto& edges: fsa.adj)
+      sort(ALL(edges));
+  assoc.resize(kept);
+}
+
+// Replaces the automaton by its complement (determinizing first if needed).
+// All expression associations are discarded; `expr` is not used.
+void FsaAnno::complement(ComplementExpr* expr) {
+  if (! deterministic) {
+    auto ignore = [](long, const vector<long>&) {};
+    fsa = fsa.determinize(NULL, ignore);
+  }
+  fsa = ~ fsa;
+  assoc.assign(fsa.n(), {});
+  deterministic = true;
+}
+
+// Concatenation: appends the states of `rhs` (shifted by the current state
+// count), links every current final state to rhs's start with an epsilon edge
+// and adopts rhs's final states. `rhs` is consumed (its contents are moved
+// from). When `expr` is non-null the combined states are tagged with it.
+void FsaAnno::concat(FsaAnno& rhs, ConcatExpr* expr) {
+  long offset = fsa.n();
+  for (long f: fsa.finals)
+    emplace_front(fsa.adj[f], epsilon, offset+rhs.fsa.start);
+  for (auto& edges: rhs.fsa.adj) {
+    for (auto& e: edges)
+      e.second += offset;
+    fsa.adj.emplace_back(move(edges));
+  }
+  fsa.finals = move(rhs.fsa.finals);
+  for (long& f: fsa.finals)
+    f += offset;
+  assoc.resize(fsa.n());
+  REP(i, rhs.fsa.n())
+    assoc[offset+i] = move(rhs.assoc[i]);
+  if (expr)
+    add_assoc(*expr);
+  deterministic = false;
+}
+
+// Determinizes the automaton unless it is already flagged deterministic.
+// Each new state receives the merged associations of the source states it
+// stands for; when `mapping` is non-null, (*mapping)[new state] is set to
+// that list of source states.
+void FsaAnno::determinize(const vector<long>* starts, vector<vector<long>>* mapping) {
+  if (deterministic)
+    return;
+  decltype(assoc) merged;
+  fsa = fsa.determinize(starts, [&](long id, const vector<long>& subset) {
+    if (merged.size() < id+1) {
+      merged.resize(id+1);
+      if (mapping)
+        mapping->resize(id+1);
+    }
+    auto& as = merged[id];
+    for (long x: subset)
+      as.insert(as.end(), ALL(assoc[x]));
+    sort_assoc(as);
+    if (mapping)
+      (*mapping)[id] = subset;
+  });
+  assoc = move(merged);
+  deterministic = true;
+}
+
+// Replaces this automaton by "this minus rhs". Both operands are determinized
+// first when necessary (only rhs.fsa is replaced; rhs's associations are not
+// used). Every result state inherits the associations of the lhs state(s) it
+// stems from. When `expr` is non-null the result states are tagged with it.
+void FsaAnno::difference(FsaAnno& rhs, DifferenceExpr* expr) {
+  vector<vector<long>> subsets; // determinized lhs state -> original lhs states; empty if lhs was already a DFA
+  decltype(rhs.assoc) merged;
+  if (! deterministic)
+    fsa = fsa.determinize(NULL, [&](long id, const vector<long>& xs) {
+      if (subsets.size() < id+1)
+        subsets.resize(id+1);
+      subsets[id] = xs;
+    });
+  if (! rhs.deterministic)
+    rhs.fsa = rhs.fsa.determinize(NULL, [](long, const vector<long>&) {});
+  fsa = fsa.difference(rhs.fsa, [&](long x) {
+    if (subsets.empty()) {
+      merged.emplace_back(assoc[x]);
+      return;
+    }
+    merged.emplace_back();
+    auto& as = merged.back();
+    for (long u: subsets[x])
+      as.insert(as.end(), ALL(assoc[u]));
+    sort_assoc(as);
+  });
+  assoc = move(merged);
+  if (expr)
+    add_assoc(*expr);
+  deterministic = true;
+}
+
+// Builds the one-state automaton whose single state is both start and final
+// and has no edges. When `expr` is non-null the state is tagged with it.
+FsaAnno FsaAnno::epsilon_fsa(EpsilonExpr* expr) {
+  FsaAnno single;
+  single.fsa.adj.resize(1);
+  single.fsa.start = 0;
+  single.fsa.finals.push_back(0);
+  single.assoc.resize(1);
+  if (expr)
+    single.add_assoc(*expr);
+  single.deterministic = true;
+  return single;
+}
+
+// Replaces this automaton by its intersection with rhs. Both operands are
+// determinized first when necessary (only rhs.fsa is replaced). Every result
+// state inherits the associations of the lhs and rhs states it stems from.
+// When `expr` is non-null the result states are tagged with it.
+void FsaAnno::intersect(FsaAnno& rhs, IntersectExpr* expr) {
+  typedef vector<pair<Expr*, ExprTag>> Assoc;
+  // Determinized state -> original states, per operand; empty if that operand was already a DFA.
+  vector<vector<long>> lsub, rsub;
+  decltype(rhs.assoc) merged;
+  // Makes a determinize() callback that stores each subset in `rel`.
+  auto recorder = [](vector<vector<long>>& rel) {
+    return [&rel](long id, const vector<long>& xs) {
+      if (rel.size() < id+1)
+        rel.resize(id+1);
+      rel[id] = xs;
+    };
+  };
+  // Appends to `dst` the associations behind state `s` of one operand.
+  auto collect = [](Assoc& dst, const vector<vector<long>>& rel, const vector<Assoc>& src, long s) {
+    if (rel.empty())
+      dst.insert(dst.end(), ALL(src[s]));
+    else
+      for (long u: rel[s])
+        dst.insert(dst.end(), ALL(src[u]));
+  };
+  if (! deterministic)
+    fsa = fsa.determinize(NULL, recorder(lsub));
+  if (! rhs.deterministic)
+    rhs.fsa = rhs.fsa.determinize(NULL, recorder(rsub));
+  fsa = fsa.intersect(rhs.fsa, [&](long x, long y) {
+    merged.emplace_back();
+    Assoc& as = merged.back();
+    collect(as, lsub, assoc, x);
+    collect(as, rsub, rhs.assoc, y);
+    sort_assoc(as);
+  });
+  assoc = move(merged);
+  if (expr)
+    add_assoc(*expr);
+  deterministic = true;
+}
+
+// Merges indistinguishable states of a deterministic automaton. Every merged
+// state receives the combined associations of its members; when `mapping` is
+// non-null, the member list of each merged state is appended to it.
+void FsaAnno::minimize(vector<vector<long>>* mapping) {
+  assert(deterministic);
+  decltype(assoc) merged;
+  fsa = fsa.distinguish([&](vector<long>& members) {
+    merged.emplace_back();
+    auto& as = merged.back();
+    for (long x: members)
+      as.insert(as.end(), ALL(assoc[x]));
+    sort_assoc(as);
+    if (mapping)
+      mapping->push_back(members);
+  });
+  assoc = move(merged);
+}
+
+// Union: appends the states of `rhs` (shifted by the current state count) and
+// adds a fresh start state with epsilon edges to both former start states.
+// `rhs` is consumed (its edge and association lists are moved from).
+// When `expr` is non-null the combined states are tagged with it.
+void FsaAnno::union_(FsaAnno& rhs, UnionExpr* expr) {
+  long offset = fsa.n(), rhs_count = rhs.fsa.n();
+  long fresh_start = offset+rhs_count, lhs_start = fsa.start;
+  fsa.start = fresh_start;
+  // rhs finals all land at or above `offset`, so appending keeps `finals` sorted.
+  for (long f: rhs.fsa.finals)
+    fsa.finals.push_back(offset+f);
+  for (auto& edges: rhs.fsa.adj) {
+    for (auto& e: edges)
+      e.second += offset;
+    fsa.adj.emplace_back(move(edges));
+  }
+  fsa.adj.emplace_back();
+  auto& entry = fsa.adj[fresh_start];
+  entry.emplace_back(epsilon, lhs_start);
+  entry.emplace_back(epsilon, offset+rhs.fsa.start);
+  assoc.resize(fsa.n());
+  REP(i, rhs.fsa.n())
+    assoc[offset+i] = move(rhs.assoc[i]);
+  if (expr)
+    add_assoc(*expr);
+  deterministic = false;
+}
+
+// One-or-more repetition: every final state gets an epsilon edge back to the
+// start state. When `expr` is non-null the states are tagged with it.
+void FsaAnno::plus(PlusExpr* expr) {
+  for (auto f = fsa.finals.begin(); f != fsa.finals.end(); ++f)
+    emplace_front(fsa.adj[*f], epsilon, fsa.start);
+  if (expr)
+    add_assoc(*expr);
+  deterministic = false;
+}
+
+// Zero-or-one: adds a fresh start state with epsilon edges to the former start
+// state and to a fresh final state, which lets the automaton be skipped.
+// When `expr` is non-null the states are tagged with it.
+void FsaAnno::question(QuestionExpr* expr) {
+  long former_start = fsa.start;
+  long entry = fsa.n(), bypass = entry+1;
+  fsa.adj.emplace_back();
+  fsa.adj.emplace_back();
+  fsa.start = entry;
+  auto& out = fsa.adj[entry];
+  out.emplace_back(epsilon, former_start);
+  out.emplace_back(epsilon, bypass);
+  fsa.finals.push_back(bypass);
+  assoc.resize(fsa.n());
+  if (expr)
+    add_assoc(*expr);
+  deterministic = false;
+}
+
+// Bounded repetition: replaces this automaton X by X repeated between
+// expr.low and expr.high times. The result is expr.low mandatory copies,
+// followed by X* when expr.high is LONG_MAX, or otherwise by the union of
+// the empty string and the powers of X built in the loop below.
+// The pieces are combined without tagging them with any expression.
+void FsaAnno::repeat(RepeatExpr& expr) {
+  FsaAnno acc = epsilon_fsa(NULL);
+  REP(i, expr.low) {
+    FsaAnno copy = *this; // concat consumes its argument
+    acc.concat(copy, NULL);
+  }
+  if (expr.high == LONG_MAX) {
+    star(NULL);
+    acc.concat(*this, NULL);
+  } else if (expr.low < expr.high) {
+    // `power` grows by one copy of X per round; each round adds it as an alternative.
+    FsaAnno optional = epsilon_fsa(NULL), power = *this;
+    ROF(i, 0, expr.high-expr.low) {
+      FsaAnno copy = power;
+      optional.union_(copy, NULL);
+      if (i) {
+        copy = *this;
+        power.concat(copy, NULL);
+      }
+    }
+    acc.concat(optional, NULL);
+  }
+  acc.deterministic = false;
+  *this = move(acc);
+}
+
+// Kleene star: adds a fresh start state and a fresh, sole final state. The new
+// start and every former final state get epsilon edges to the former start
+// state and to the new final state. When `expr` is non-null the states are
+// tagged with it.
+void FsaAnno::star(StarExpr* expr) {
+  long former_start = fsa.start;
+  long entry = fsa.n(), exit_state = entry+1;
+  fsa.adj.emplace_back();
+  fsa.adj.emplace_back();
+  fsa.start = entry;
+  auto& out = fsa.adj[entry];
+  out.emplace_back(epsilon, former_start);
+  out.emplace_back(epsilon, exit_state);
+  for (long f: fsa.finals) {
+    auto& edges = fsa.adj[f];
+    sorted_emplace(edges, epsilon, former_start);
+    sorted_emplace(edges, epsilon, exit_state);
+  }
+  fsa.finals.assign(1, exit_state);
+  assoc.resize(fsa.n());
+  if (expr)
+    add_assoc(*expr);
+  deterministic = false;
+}
+
+// Two-state automaton for a bracket expression: state 0 (start) has one edge
+// to state 1 (final) per entry of expr.intervals.to.
+FsaAnno FsaAnno::bracket(BracketExpr& expr) {
+  FsaAnno r;
+  r.fsa.adj.resize(2);
+  r.fsa.start = 0;
+  r.fsa.finals = {1};
+  auto& edges = r.fsa.adj[0];
+  for (auto& range: expr.intervals.to)
+    edges.emplace_back(range, 1);
+  r.assoc.resize(2);
+  r.add_assoc(expr);
+  r.deterministic = true;
+  return r;
+}
+
+// Two-state automaton for a call expression: a single edge 0 -> 1 carrying
+// the next unused call label. The global call_label counter is advanced.
+FsaAnno FsaAnno::call(CallExpr& expr) {
+  // represented by (0, special, 1)
+  FsaAnno r;
+  r.fsa.adj.resize(2);
+  r.fsa.start = 0;
+  r.fsa.finals = {1};
+  auto label = call_label++;
+  r.fsa.adj[0].emplace_back(make_pair(label, label+1), 1);
+  r.assoc.resize(2);
+  r.add_assoc(expr);
+  r.deterministic = true;
+  return r;
+}
+
+// Two-state automaton for a collapse expression: a single edge 0 -> 1 carrying
+// the next unused collapse label. The global collapse_label counter is advanced.
+FsaAnno FsaAnno::collapse(CollapseExpr& expr) {
+  // represented by (0, special, 1)
+  FsaAnno r;
+  r.fsa.adj.resize(2);
+  r.fsa.start = 0;
+  r.fsa.finals = {1};
+  auto label = collapse_label++;
+  r.fsa.adj[0].emplace_back(make_pair(label, label+1), 1);
+  r.assoc.resize(2);
+  r.add_assoc(expr);
+  r.deterministic = true;
+  return r;
+}
+
+// Two-state automaton accepting any single symbol in [0, AB).
+// When `expr` is non-null the states are tagged with it.
+FsaAnno FsaAnno::dot(DotExpr* expr) {
+  FsaAnno r;
+  r.fsa.adj.resize(2);
+  r.fsa.adj[0].emplace_back(make_pair(0L, AB), 1);
+  r.fsa.start = 0;
+  r.fsa.finals = {1};
+  r.assoc.resize(2);
+  if (expr)
+    r.add_assoc(*expr);
+  r.deterministic = true;
+  return r;
+}
+
+// Automaton for an embedded reference.
+// - Macro (no define_stmt): a two-state automaton with a single edge labelled
+//   expr.macro_value.
+// - Otherwise: a copy of the automaton compiled for expr.define_stmt, in which
+//   every edge in the call-label range [call_label_base, call_label) is given
+//   fresh labels so that each embedding has its own CallExpr labels.
+FsaAnno FsaAnno::embed(EmbedExpr& expr) {
+  if (! expr.define_stmt) { // macro
+    FsaAnno r;
+    r.fsa.adj.resize(2);
+    r.fsa.start = 0;
+    r.fsa.finals = {1};
+    r.fsa.adj[0].emplace_back(make_pair(expr.macro_value, expr.macro_value+1), 1);
+    r.assoc.resize(2);
+    r.add_assoc(expr);
+    r.deterministic = true;
+    return r;
+  }
+  FsaAnno r = compiled[expr.define_stmt];
+  // change the labels to differentiate instances of CallExpr
+  for (auto& edges: r.fsa.adj) {
+    auto it = upper_bound(ALL(edges), make_pair(make_pair(call_label_base, LONG_MAX), LONG_MAX));
+    // Step back when the preceding edge already reaches into the call-label range.
+    if (it != edges.begin() && call_label_base < (it-1)->first.second)
+      --it;
+    while (it != edges.end() && it->first.first < call_label) {
+      assert(call_label_base <= it->first.first);
+      long width = it->first.second-it->first.first;
+      it->first.first = call_label;
+      call_label += width;
+      it->first.second = call_label;
+      assert(it->first.second <= call_label);
+      ++it;
+    }
+  }
+  r.add_assoc(expr);
+  return r;
+}
+
+// Linear automaton for a string literal: one state per position, one edge per
+// unit, with the last state final. With opt_bytes the units are the raw bytes
+// of the literal; otherwise they are the code points produced by ICU's
+// U8_NEXT_OR_FFFD.
+FsaAnno FsaAnno::literal(LiteralExpr& expr) {
+  FsaAnno r;
+  r.fsa.start = 0;
+  const auto& text = expr.literal;
+  long len = 0; // number of units, i.e. index of the final state
+  if (opt_bytes) {
+    len = text.size();
+    r.fsa.adj.resize(len+1);
+    REP(i, text.size()) {
+      long byte = (u8)text[i];
+      r.fsa.adj[i].emplace_back(make_pair(byte, byte+1), i+1);
+    }
+  } else {
+    i32 c, i = 0;
+    while (i < text.size()) {
+      U8_NEXT_OR_FFFD(text.c_str(), i, text.size(), c); // advances i past one code point
+      r.fsa.adj.emplace_back();
+      r.fsa.adj[len].emplace_back(make_pair(c, c+1), len+1);
+      len++;
+    }
+    r.fsa.adj.emplace_back();
+  }
+  r.fsa.finals.push_back(len);
+  r.assoc.resize(len+1);
+  r.add_assoc(expr);
+  r.deterministic = true;
+  return r;
+}
+
+// Turns the automaton into one accepting substrings: a fresh start state gets
+// epsilon edges into states, and states get epsilon edges to a fresh, sole
+// final state. A state tagged 'inner' for an intact definition is excluded,
+// except that the former start state stays enterable and former final states
+// keep their exit.
+void FsaAnno::substring_grammar() {
+  long entry = fsa.n(), exit_state = entry+1, former_start = fsa.start;
+  fsa.start = entry;
+  fsa.adj.emplace_back();
+  fsa.adj.emplace_back();
+  REP(i, entry) {
+    bool unrestricted = true;
+    for (auto aa: assoc[i]) {
+      // For a CollapseExpr the referenced definition decides; otherwise the owning statement does.
+      auto collapse = dynamic_cast<CollapseExpr*>(aa.first);
+      bool intact = collapse ? collapse->define_stmt->intact : aa.first->stmt->intact;
+      if (intact && has_inner(aa.second)) {
+        unrestricted = false;
+        break;
+      }
+    }
+    if (unrestricted || i == former_start)
+      fsa.adj[entry].emplace_back(epsilon, i);
+    if (unrestricted || fsa.is_final(i))
+      emplace_front(fsa.adj[i], epsilon, exit_state);
+  }
+  fsa.finals.assign(1, exit_state);
+  assoc.resize(fsa.n());
+  deterministic = false;
+}

+ 44 - 0
yanshi/src/fsa_anno.hh

@@ -0,0 +1,44 @@
+#pragma once
+#include "fsa.hh"
+#include "syntax.hh"
+
+enum class ExprTag { // bit flags describing a state's role within an expression's automaton; combined with |
+  start = 1, // the state is the automaton's start state
+  inner = 2, // the state is neither start nor final
+  final = 4, // the state is a final state
+};
+
+// Bit tests on a (possibly combined) ExprTag value.
+extern inline bool has_start(ExprTag x) { return (long(x) & long(ExprTag::start)) != 0; }
+extern inline bool has_inner(ExprTag x) { return (long(x) & long(ExprTag::inner)) != 0; }
+extern inline bool has_final(ExprTag x) { return (long(x) & long(ExprTag::final)) != 0; }
+
+bool operator<(ExprTag x, ExprTag y);
+bool assoc_has_expr(vector<pair<Expr*, ExprTag>>& as, const Expr* x);
+
+struct FsaAnno { // an Fsa plus, per state, the expressions the state belongs to
+  bool deterministic; // set by the operations below; true when the operation yields a DFA
+  Fsa fsa;
+  vector<vector<pair<Expr*, ExprTag>>> assoc; // assoc[state]: (expression, role tag) entries
+  void accessible(const vector<long>* starts, vector<long>& mapping); // prune unreachable states; appends old indices to mapping
+  void add_assoc(Expr& expr); // tag every state with expr (start / inner / final)
+  void complement(ComplementExpr* expr); // complement; discards all associations
+  void co_accessible(const vector<bool>* final, vector<long>& mapping); // prune states that cannot reach a final state
+  void concat(FsaAnno& rhs, ConcatExpr* expr); // this followed by rhs; rhs is consumed
+  void determinize(const vector<long>* starts, vector<vector<long>>* mapping); // no-op when already deterministic
+  void difference(FsaAnno& rhs, DifferenceExpr* expr); // this minus rhs
+  void intersect(FsaAnno& rhs, IntersectExpr* expr); // this and rhs
+  void minimize(vector<vector<long>>* mapping); // requires deterministic; merges indistinguishable states
+  void plus(PlusExpr* expr); // one or more repetitions
+  void question(QuestionExpr* expr); // zero or one occurrence
+  void repeat(RepeatExpr& expr); // between expr.low and expr.high repetitions
+  void star(StarExpr* expr); // zero or more repetitions
+  void substring_grammar(); // accept substrings, subject to 'intact' restrictions
+  void union_(FsaAnno& rhs, UnionExpr* expr); // this or rhs; rhs is consumed
+  static FsaAnno bracket(BracketExpr& expr); // one symbol from expr.intervals.to
+  static FsaAnno call(CallExpr& expr); // single edge with a fresh call label
+  static FsaAnno collapse(CollapseExpr& expr); // single edge with a fresh collapse label
+  static FsaAnno dot(DotExpr* expr); // any single symbol in [0, AB)
+  static FsaAnno embed(EmbedExpr& expr); // copy of a compiled definition, or a macro value
+  static FsaAnno epsilon_fsa(EpsilonExpr* expr); // one state that is both start and final
+  static FsaAnno literal(LiteralExpr& expr); // the literal's bytes or code points in sequence
+};

+ 326 - 0
yanshi/src/lexer.l

@@ -0,0 +1,326 @@
+%{
+#include "lexer_helper.hh"
+#include "option.hh"
+#include "parser.hh"
+#include "syntax.hh"
+
+#include <limits.h>
+#include <stdlib.h>
+#include <string>
+#include <string.h>
+#include <unicode/utf8.h>
+using namespace std;
+
+// Location bookkeeping hook.  The scanner's extra data (a long) is read as the
+// offset where the current match starts; the token location becomes
+// [start, start + yyleng) and the extra data is advanced to the end offset.
+// NOTE(review): the ';' after `while (0)` looks intentional -- flex is believed
+// to paste YY_USER_ACTION in front of each action without adding a terminator
+// of its own; check the generated lexer.cc before removing it.
+#define YY_USER_ACTION                      \
+  do {                                      \
+    yylloc->start = yyget_extra(yyscanner); \
+    yylloc->end = yylloc->start + yyleng;   \
+    yyset_extra(yylloc->end, yyscanner);    \
+  } while (0);
+
+// Scratch state shared by the rules below (file-level, so not per-scanner):
+//   tmp_bracket  raw text collected between '{' and the matching '}' of a code block
+//   tmp_str      decoded contents of the string literal being scanned
+//   tmp_str_pos  start offset of that literal, restored into yylloc when it closes
+//   semicolon    toggled by the "semicolon"/"nosemicolon" keywords; selects whether
+//                ';' or a newline is returned as the '\n' token
+static string tmp_bracket, tmp_str;
+static long tmp_str_pos;
+static bool semicolon;
+
+// Records an "invalid \-escape" message for `text` in yylval->errmsg and
+// returns the INVALID_CHARACTER token so a rule can `return` it directly.
+// Returns int, like the sibling invalid_escape_* helpers and the scanner itself.
+static int invalid_escape(YYSTYPE* yylval, const char* text)
+{
+  yylval->errmsg = aprintf("invalid \\-escape: %s", text);
+  return INVALID_CHARACTER;
+}
+
+// The four helpers below share one shape: store a message naming the
+// offending escape text in yylval->errmsg (allocated by aprintf) and return
+// INVALID_CHARACTER so the calling rule can `return` the result directly.
+
+// Octal escape whose value is out of range.
+static int invalid_escape_octonary(YYSTYPE* yylval, const char* text)
+{
+  yylval->errmsg = aprintf("invalid number after \\-escape: %s", text);
+  return INVALID_CHARACTER;
+}
+
+// \x escape whose value is out of range.
+static int invalid_escape_x(YYSTYPE* yylval, const char* text)
+{
+  yylval->errmsg = aprintf("invalid number after \\x-escape: %s", text);
+  return INVALID_CHARACTER;
+}
+
+// \u escape whose value is out of range.
+static int invalid_escape_u(YYSTYPE* yylval, const char* text)
+{
+  yylval->errmsg = aprintf("invalid number after \\u-escape: %s", text);
+  return INVALID_CHARACTER;
+}
+
+// \U escape whose value is out of range.
+static int invalid_escape_U(YYSTYPE* yylval, const char* text)
+{
+  yylval->errmsg = aprintf("invalid number after \\U-escape: %s", text);
+  return INVALID_CHARACTER;
+}
+
+// Stores "missing <token_end> at end of file" in yylval->errmsg.  Only sets
+// the message; the caller decides which token (if any) to return.
+static void unexpected_eof(YYSTYPE* yylval, const char* token_end)
+{
+  yylval->errmsg = aprintf("missing %s at end of file", token_end);
+}
+
+// Stores "missing <token_end> at end of line" in yylval->errmsg.  Only sets
+// the message; the caller decides which token (if any) to return.
+static void unexpected_newline(YYSTYPE* yylval, const char* token_end)
+{
+  yylval->errmsg = aprintf("missing %s at end of line", token_end);
+}
+
+// Used by the \u and \U bracket rules when opt_bytes is set: stores the
+// message and returns INVALID_CHARACTER.
+static int unexpected_codepoint(YYSTYPE* yylval)
+{
+  yylval->errmsg = aprintf("cannot use Unicode codepoints");
+  return INVALID_CHARACTER;
+}
+
+// End-of-input hook (the scanner uses prefix "raw_yy" and %option yywrap).
+// Resets the semicolon mode and returns 1, i.e. there is no further input.
+// The yyscanner argument is not used.
+extern "C" int raw_yywrap(yyscan_t yyscanner)
+{
+  semicolon = false;
+  return 1;
+}
+%}
+
+%option yywrap noinput
+%option reentrant
+%option extra-type="long"
+%option bison-bridge bison-locations
+%option prefix="raw_yy"
+%option stack
+
+%x EXPECT_CODE
+%x AFTER_ACTION_OP
+%x AFTER_EXPORT
+%x IN_BRACE
+%x IN_CODE
+%x IN_COMMENT
+%x IN_BRACKET
+%x IN_BRACKET_FIRST
+%x IN_LINE_COMMENT
+%s IN_PAREN
+%x IN_Q_STRING
+%x IN_QQ_STRING
+
+D			[0-9]
+H			[0-9A-Fa-f]
+L			[a-zA-Z_\x80-\xff]
+
+%%
+
+"::" { return COLONCOLON; }
+".." { return DOTDOT; }
+"&&" { return AMPERAMPER; }
+";" {
+  /* a semicolon acts as the statement terminator only in semicolon mode */
+  if (semicolon) return '\n';
+}
+[-~!&*=+,.?|{}:] { return yytext[0]; }
+"action" { yy_push_state(EXPECT_CODE, yyscanner); return ACTION; }
+"as" { return AS; }
+"c++" { yy_push_state(EXPECT_CODE, yyscanner); return CPP; }
+"epsilon" { return EPSILON; }
+"export" { yy_push_state(AFTER_EXPORT, yyscanner); return EXPORT; }
+"import" { return IMPORT; }
+"intact" { return INTACT; }
+"semicolon" { semicolon = true; }
+"nosemicolon" { semicolon = false; }
+{L}({L}|{D})* {
+  /* the token owns a freshly allocated copy of the identifier text */
+  yylval->str = new string(yytext);
+  return IDENT;
+}
+{D}+ { yylval->integer = atol(yytext); return INTEGER; }
+"#define" { return PREPROCESS_DEFINE; }
+
+"#" { yy_push_state(IN_LINE_COMMENT, yyscanner); }
+"//" { yy_push_state(IN_LINE_COMMENT, yyscanner); }
+<IN_LINE_COMMENT>{
+  "\n" {
+    /* leave the comment and push the newline back so the enclosing state
+       scans it again; the stored offset is moved back by one to match */
+    yy_pop_state(yyscanner);
+    unput('\n');
+    yyset_extra(yylloc->end-1, yyscanner);
+  }
+  <<EOF>> { yy_pop_state(yyscanner); }
+  . {}
+}
+
+"/*" { yy_push_state(IN_COMMENT, yyscanner); }
+<IN_COMMENT>{
+  "*/" { yy_pop_state(yyscanner); }
+  <<EOF>> {
+    /* an unterminated block comment simply ends with the input */
+    yy_pop_state(yyscanner);
+  }
+  .|\n {}
+}
+
+"(" yy_push_state(IN_PAREN, yyscanner); return '(';
+")" {
+  if (YY_START != IN_PAREN) {
+    /* closing parenthesis without an open one: say so, instead of the old
+       message that claimed a parenthesis was missing at end of line */
+    yylval->errmsg = strdup("unmatched )");
+    return INVALID_CHARACTER;
+  }
+  yy_pop_state(yyscanner);
+  return ')';
+}
+
+"[" yy_push_state(IN_BRACKET_FIRST, yyscanner); return '[';
+<IN_BRACKET_FIRST>{
+  "^" BEGIN IN_BRACKET; return '^';
+  [^-\\\]\n] { yy_pop_state(yyscanner); yy_push_state(IN_BRACKET, yyscanner); yylval->integer = yytext[0]; return CHAR; }
+  "-" {
+    /* a leading dash is a literal character; once it is consumed we are no
+       longer at the first position, so switch to IN_BRACKET like every other
+       first character does -- otherwise the closing bracket is never matched */
+    BEGIN IN_BRACKET;
+    yylval->integer = '-';
+    return CHAR;
+  }
+}
+<IN_BRACKET>{
+  "]" {
+    /* close the bracket expression; the token is returned only when the
+       state we fall back to is INITIAL or IN_PAREN */
+    yy_pop_state(yyscanner);
+    if (YY_START == INITIAL || YY_START == IN_PAREN)
+      return ']';
+  }
+  [^-\\\]\n] yylval->integer = yytext[0]; return CHAR;
+}
+<IN_BRACKET_FIRST,IN_BRACKET>{
+  \\[0-7]+ {
+    /* octal escape: must fit in one byte */
+    BEGIN IN_BRACKET;
+    long c = strtol(yytext+1, NULL, 8);
+    if (UCHAR_MAX < c)
+      return invalid_escape_octonary(yylval, yytext);
+    yylval->integer = c;
+    return CHAR;
+  }
+  \\u[0-9a-fA-F]+ {
+    /* 16-bit code point escape; rejected when opt_bytes is set */
+    BEGIN IN_BRACKET;
+    if (opt_bytes)
+      return unexpected_codepoint(yylval);
+    long c = strtol(yytext+2, NULL, 16);
+    if (UINT16_MAX < c)
+      return invalid_escape_u(yylval, yytext);
+    yylval->integer = c;
+    return CHAR;
+  }
+  \\U[0-9a-fA-F]+ {
+    /* full code point escape; rejected when opt_bytes is set */
+    BEGIN IN_BRACKET;
+    if (opt_bytes)
+      return unexpected_codepoint(yylval);
+    long c = strtol(yytext+2, NULL, 16);
+    if (MAX_CODEPOINT < c)
+      return invalid_escape_U(yylval, yytext);
+    yylval->integer = c;
+    return CHAR;
+  }
+  \\x[0-9a-fA-F]+ {
+    /* hexadecimal escape: must fit in one byte */
+    BEGIN IN_BRACKET;
+    long c = strtol(yytext+2, NULL, 16);
+    if (UCHAR_MAX < c)
+      return invalid_escape_x(yylval, yytext);
+    yylval->integer = c;
+    return CHAR;
+  }
+  \\a BEGIN IN_BRACKET; yylval->integer = '\a'; return CHAR;
+  \\b BEGIN IN_BRACKET; yylval->integer = '\b'; return CHAR;
+  \\f BEGIN IN_BRACKET; yylval->integer = '\f'; return CHAR;
+  \\n BEGIN IN_BRACKET; yylval->integer = '\n'; return CHAR;
+  \\r BEGIN IN_BRACKET; yylval->integer = '\r'; return CHAR;
+  \\t BEGIN IN_BRACKET; yylval->integer = '\t'; return CHAR;
+  \\v BEGIN IN_BRACKET; yylval->integer = '\v'; return CHAR;
+  \\. BEGIN IN_BRACKET; yylval->integer = yytext[1]; return CHAR;
+  - BEGIN IN_BRACKET; return '-';
+  "\n" unexpected_newline(yylval, "]"); return INVALID_CHARACTER;
+  <<EOF>> {
+    /* report the unterminated bracket: the message alone is never seen by
+       the parser, so hand it over with an INVALID_CHARACTER token */
+    yy_pop_state(yyscanner);
+    unexpected_eof(yylval, "]");
+    return INVALID_CHARACTER;
+  }
+}
+
+<AFTER_EXPORT>{ // optional 'BRACED_CODE' to specify extra parameters
+  "intact" { yy_pop_state(yyscanner); return INTACT; }
+  {L}({L}|{D})* {
+    yy_pop_state(yyscanner);
+    yylval->str = new string(yytext);
+    return IDENT;
+  }
+  "{" {
+    /* start collecting a braced code block from scratch */
+    BEGIN IN_CODE;
+    tmp_bracket.clear();
+  }
+  [ \t\n] {}
+  <<EOF>> { yy_pop_state(yyscanner); }
+}
+
+<EXPECT_CODE>{
+  {L}({L}|{D})* {
+    /* an identifier may precede the code block; the state is kept */
+    yylval->str = new string(yytext);
+    return IDENT;
+  }
+  "{" {
+    BEGIN IN_CODE;
+    tmp_bracket.clear();
+  }
+  [ \t\n] {}
+  <<EOF>> { yy_pop_state(yyscanner); }
+}
+
+[>@%$] { yy_push_state(AFTER_ACTION_OP, yyscanner); return yytext[0]; }
+<AFTER_ACTION_OP>{
+  -?{D}+ {
+    /* optionally signed integer; the state is kept so more may follow */
+    yylval->integer = atol(yytext);
+    return INTEGER;
+  }
+  {L}({L}|{D})* {
+    yy_pop_state(yyscanner);
+    yylval->str = new string(yytext);
+    return IDENT;
+  }
+  "{" {
+    BEGIN IN_CODE;
+    tmp_bracket.clear();
+  }
+  [ \t\n]+ {}
+  <<EOF>> { yy_pop_state(yyscanner); }
+  . { yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER; }
+}
+<IN_CODE>{
+  "'" { tmp_bracket += '\''; yy_push_state(IN_Q_STRING, yyscanner); }
+  "\"" { tmp_bracket += '"'; yy_push_state(IN_QQ_STRING, yyscanner); }
+  "{" { tmp_bracket += '{'; yy_push_state(IN_CODE, yyscanner); }
+  "}" {
+    /* the outermost closing brace ends the block; inner ones are just text */
+    yy_pop_state(yyscanner);
+    if (YY_START == INITIAL || YY_START == IN_PAREN) {
+      yylval->str = new string(tmp_bracket);
+      return BRACED_CODE;
+    } else
+      tmp_bracket += '}';
+  }
+  .|"\n" tmp_bracket += yytext[0];
+  <<EOF>> {
+    /* report the unterminated block: the message alone is never seen by the
+       parser, so hand it over with an INVALID_CHARACTER token */
+    yy_pop_state(yyscanner);
+    unexpected_eof(yylval, "}");
+    return INVALID_CHARACTER;
+  }
+}
+
+' {
+  /* remember where the literal starts; the closing quote restores it */
+  tmp_str.clear();
+  tmp_str_pos = yylloc->start;
+  yy_push_state(IN_Q_STRING, yyscanner);
+}
+"\"" {
+  tmp_str.clear();
+  tmp_str_pos = yylloc->start;
+  yy_push_state(IN_QQ_STRING, yyscanner);
+}
+<IN_Q_STRING>{
+  ' {
+    /* at top level the literal becomes a token; inside a code block the
+       quote is only appended to the collected code text */
+    yy_pop_state(yyscanner);
+    if (YY_START == INITIAL || YY_START == IN_PAREN) {
+      yylval->str = new string(tmp_str);
+      yylloc->start = tmp_str_pos;
+      return STRING_LITERAL;
+    }
+    tmp_bracket += yytext;
+  }
+  <<EOF>> {
+    /* report the unterminated literal: the message alone is never seen by
+       the parser, so hand it over with an INVALID_CHARACTER token */
+    yy_pop_state(yyscanner);
+    unexpected_eof(yylval, "'");
+    return INVALID_CHARACTER;
+  }
+}
+<IN_QQ_STRING>{
+  "\"" {
+    /* at top level the literal becomes a token; inside a code block the
+       quote is only appended to the collected code text */
+    yy_pop_state(yyscanner);
+    if (YY_START == INITIAL || YY_START == IN_PAREN) {
+      yylval->str = new string(tmp_str);
+      yylloc->start = tmp_str_pos;
+      return STRING_LITERAL;
+    }
+    tmp_bracket += yytext;
+  }
+  <<EOF>> {
+    /* report the unterminated literal: the message alone is never seen by
+       the parser, so hand it over with an INVALID_CHARACTER token */
+    yy_pop_state(yyscanner);
+    unexpected_eof(yylval, "\"");
+    return INVALID_CHARACTER;
+  }
+}
+
+<IN_Q_STRING,IN_QQ_STRING>{
+  \\[0-7]+ {
+    /* octal escape: the decoded byte goes to tmp_str, the raw escape text
+       to tmp_bracket (used when the string sits inside a code block) */
+    long c = strtol(yytext+1, NULL, 8);
+    if (UCHAR_MAX < c)
+      return invalid_escape_octonary(yylval, yytext);
+    tmp_str.push_back(c);
+    tmp_bracket += yytext;
+  }
+  \\x[0-9a-fA-F]+ {
+    /* hexadecimal escape: must fit in one byte */
+    long c = strtol(yytext+2, NULL, 16);
+    if (UCHAR_MAX < c)
+      return invalid_escape_x(yylval, yytext);
+    tmp_str.push_back(c);
+    tmp_bracket += yytext;
+  }
+  \\u[0-9a-fA-F]+ {
+    /* 16-bit code point: appended to tmp_str as UTF-8 bytes */
+    char s[4];
+    long c = strtol(yytext+2, NULL, 16), len = 0;
+    if (UINT16_MAX < c)
+      return invalid_escape_u(yylval, yytext);
+    U8_APPEND_UNSAFE(s, len, c);
+    tmp_str.insert(tmp_str.end(), s, s+len);
+    tmp_bracket += yytext;
+  }
+  \\U[0-9a-fA-F]+ {
+    /* code point up to MAX_CODEPOINT: appended to tmp_str as UTF-8 bytes */
+    char s[4];
+    long c = strtol(yytext+2, NULL, 16), len = 0;
+    if (MAX_CODEPOINT < c)
+      return invalid_escape_U(yylval, yytext);
+    U8_APPEND_UNSAFE(s, len, c);
+    tmp_str.insert(tmp_str.end(), s, s+len);
+    tmp_bracket += yytext;
+  }
+  \\a tmp_str += '\a'; tmp_bracket += yytext;
+  \\b tmp_str += '\b'; tmp_bracket += yytext;
+  \\f tmp_str += '\f'; tmp_bracket += yytext;
+  \\n tmp_str += '\n'; tmp_bracket += yytext;
+  \\r tmp_str += '\r'; tmp_bracket += yytext;
+  \\t tmp_str += '\t'; tmp_bracket += yytext;
+  \\v tmp_str += '\v'; tmp_bracket += yytext;
+  \\[\n\"\'?\\] tmp_str += yytext[1]; tmp_bracket += yytext;
+  \\. return invalid_escape(yylval, yytext);
+  .|\n tmp_str += yytext[0]; tmp_bracket += yytext[0];
+}
+
+\\\n {}
+"\n" {
+  /* a newline is a statement terminator only in the INITIAL state and only
+     while semicolon mode is off */
+  if (YY_START == INITIAL && ! semicolon) return '\n';
+}
+[ \t]+ {}
+. {
+  yylval->errmsg = strdup("invalid character");
+  return INVALID_CHARACTER;
+}

+ 15 - 0
yanshi/src/lexer_helper.cc

@@ -0,0 +1,15 @@
+#include "common.hh"
+
+#include <cstdarg>
+#include <cstdio>
+using namespace std;
+
+// printf into a freshly allocated string.
+// Returns the formatted string, which the caller releases with free(), or
+// NULL when formatting or allocation fails.
+char* aprintf(const char* fmt, ...)
+{
+  va_list va;
+  va_start(va, fmt);
+  char* r = NULL;
+  // vasprintf may leave the output pointer unspecified when it fails, so
+  // normalise the result instead of returning whatever it left behind.
+  if (vasprintf(&r, fmt, va) < 0)
+    r = NULL;
+  va_end(va);
+  return r;
+}

+ 4 - 0
yanshi/src/lexer_helper.hh

@@ -0,0 +1,4 @@
+#pragma once
+
+// printf-style formatting into a newly allocated string (see lexer_helper.cc).
+// The format attribute lets the compiler check arguments against `fmt`.
+char* aprintf(const char* fmt, ...)
+  __attribute__((format(printf, 1, 2)));

+ 556 - 0
yanshi/src/loader.cc

@@ -0,0 +1,556 @@
+#include "common.hh"
+#include "compiler.hh"
+#include "loader.hh"
+#include "option.hh"
+#include "parser.hh"
+#include "repl.hh"
+
+#include <algorithm>
+#include <errno.h>
+#include <functional>
+#include <stdio.h>
+#include <stack>
+#include <string.h>
+#include <sys/stat.h>
+#include <sysexits.h>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+using namespace std;
+
+static map<pair<dev_t, ino_t>, Module> inode2module;
+static unordered_map<DefineStmt*, vector<DefineStmt*>> depended_by; // key ranges over all DefineStmt
+map<DefineStmt*, vector<Expr*>> used_as_call, used_as_collapse, used_as_embed;
+static DefineStmt* main_export;
+Module* main_module;
+FILE *output, *output_header;
+
+// Dumps a module's file name, imports, actions and definitions to stdout.
+void print_module_info(Module& mo)
+{
+  yellow();
+  printf("filename: %s\n", mo.filename.c_str());
+
+  cyan(); puts("qualified imports:"); sgr0();
+  for (const auto& alias_and_module: mo.qualified_import) {
+    const char* file = alias_and_module.second->filename.c_str();
+    const char* alias = alias_and_module.first.c_str();
+    printf("  %s as %s\n", file, alias);
+  }
+
+  cyan(); puts("unqualified imports:"); sgr0();
+  for (Module* imported: mo.unqualified_import)
+    printf("  %s\n", imported->filename.c_str());
+
+  cyan(); puts("defined actions:"); sgr0();
+  for (const auto& action: mo.defined_action)
+    printf("  %s\n", action.first.c_str());
+
+  cyan(); puts("defined:"); sgr0();
+  for (const auto& definition: mo.defined)
+    printf("  %s\n", definition.first.c_str());
+}
+
+// Looks up `ident` as seen from module `mo`.
+// With a qualifier, only the definitions of that qualified import are searched.
+// Without one, the module's own macros and definitions are searched, followed
+// by those of every unqualified import.
+// Returns the statement, NULL when nothing matches, or the sentinel (Stmt*)1
+// when more than one candidate matches.
+Stmt* resolve(Module& mo, const string qualified, const string& ident)
+{
+  if (! qualified.empty()) {
+    auto imported = mo.qualified_import.find(qualified);
+    if (imported == mo.qualified_import.end())
+      return NULL;
+    auto& defs = imported->second->defined;
+    auto hit = defs.find(ident);
+    if (hit == defs.end())
+      return NULL;
+    return hit->second;
+  }
+
+  Stmt* found = NULL;
+  // Records a candidate; answers false when one was already recorded.
+  auto claim = [&found](Stmt* candidate) {
+    if (found)
+      return false;
+    found = candidate;
+    return true;
+  };
+
+  auto own_macro = mo.macro.find(ident);
+  if (own_macro != mo.macro.end())
+    found = own_macro->second;
+  auto own_def = mo.defined.find(ident);
+  if (own_def != mo.defined.end() && ! claim(own_def->second))
+    return (Stmt*)1;
+
+  for (Module* imported: mo.unqualified_import) {
+    auto macro = imported->macro.find(ident);
+    if (macro != imported->macro.end() && ! claim(macro->second))
+      return (Stmt*)1;
+    auto def = imported->defined.find(ident);
+    if (def != imported->defined.end() && ! claim(def->second))
+      return (Stmt*)1;
+  }
+  return found;
+}
+
+// Action counterpart of resolve(): looks up `ident` among defined actions.
+// With a qualifier, only that qualified import is searched; otherwise the
+// module itself and then every unqualified import.
+// Returns the action, NULL when nothing matches, or the sentinel
+// (ActionStmt*)1 when an import supplies a second candidate.
+ActionStmt* resolve_action(Module& mo, const string qualified, const string& ident)
+{
+  if (! qualified.empty()) {
+    auto imported = mo.qualified_import.find(qualified);
+    if (imported == mo.qualified_import.end())
+      return NULL;
+    auto& actions = imported->second->defined_action;
+    auto hit = actions.find(ident);
+    if (hit == actions.end())
+      return NULL;
+    return hit->second;
+  }
+
+  ActionStmt* found = NULL;
+  auto own = mo.defined_action.find(ident);
+  if (own != mo.defined_action.end())
+    found = own->second;
+  for (Module* imported: mo.unqualified_import) {
+    auto hit = imported->defined_action.find(ident);
+    if (hit == imported->defined_action.end())
+      continue;
+    if (found)
+      return (ActionStmt*)1;
+    found = hit->second;
+  }
+  return found;
+}
+
+// First loader pass over a module's statements: registers actions,
+// definitions and macros in the module's tables and loads imported modules.
+// Every problem is reported through mo.locfile and counted in n_errors.
+struct ModuleImportDef : PreorderStmtVisitor {
+  Module& mo;
+  long& n_errors;
+  ModuleImportDef(Module& mo, long& n_errors) : mo(mo), n_errors(n_errors) {}
+
+  // Register a named action; a second action of the same name is an error.
+  void visit(ActionStmt& stmt) override {
+    if (! mo.defined_action.count(stmt.ident)) {
+      mo.defined_action[stmt.ident] = &stmt;
+      return;
+    }
+    n_errors++;
+    mo.locfile.error(stmt.loc, "redefined '%s'", stmt.ident.c_str());
+  }
+  // TODO report error: import 'aa.hs' (#define d 3) ; #define d 4
+  // Register a definition; the name must not clash with a definition or macro.
+  void visit(DefineStmt& stmt) override {
+    const bool taken = mo.defined.count(stmt.lhs) || mo.macro.count(stmt.lhs);
+    if (taken) {
+      n_errors++;
+      mo.locfile.error(stmt.loc, "redefined '%s'", stmt.lhs.c_str());
+      return;
+    }
+    mo.defined.emplace(stmt.lhs, &stmt);
+    stmt.module = &mo;
+    depended_by[&stmt]; // empty
+  }
+  // Load the imported file and record it as qualified or unqualified import.
+  void visit(ImportStmt& stmt) override {
+    Module* m = load_module(n_errors, stmt.filename);
+    if (! m) {
+      n_errors++;
+      mo.locfile.error(stmt.loc, "'%s': %s", stmt.filename.c_str(), errno ? strerror(errno) : "parse error");
+      return;
+    }
+    if (! stmt.qualified.empty()) {
+      mo.qualified_import[stmt.qualified] = m;
+      return;
+    }
+    // an unqualified import is listed once, however often it is repeated
+    auto& imports = mo.unqualified_import;
+    if (std::find(imports.begin(), imports.end(), m) == imports.end())
+      imports.push_back(m);
+  }
+  // Register a macro; the name must not clash with a definition or macro.
+  void visit(PreprocessDefineStmt& stmt) override {
+    const bool taken = mo.defined.count(stmt.ident) || mo.macro.count(stmt.ident);
+    if (taken) {
+      n_errors++;
+      mo.locfile.error(stmt.loc, "redefined '%s'", stmt.ident.c_str());
+      return;
+    }
+    mo.macro[stmt.ident] = &stmt;
+  }
+};
+
+// Second loader pass: resolves every name used inside a module (actions,
+// calls, collapses, embeds) against the tables filled by the first pass,
+// records who uses which definition, and reports unresolved or ambiguous names
+// through mo.locfile while counting them in n_errors.
+struct ModuleUse : PrePostActionExprStmtVisitor {
+  Module& mo;
+  long& n_errors;
+  // Definition whose right-hand side is currently being visited (NULL outside).
+  DefineStmt* stmt = NULL;
+  ModuleUse(Module& mo, long& n_errors) : mo(mo), n_errors(n_errors) {}
+
+  // Tag every expression with the definition that contains it.
+  void pre_expr(Expr& expr) override {
+    expr.stmt = stmt;
+  }
+
+  // After an expression's children: visit the actions attached to it.
+  void post_expr(Expr& expr) override {
+    for (auto a: expr.entering)
+      PrePostActionExprStmtVisitor::visit(*a.first);
+    for (auto a: expr.finishing)
+      PrePostActionExprStmtVisitor::visit(*a.first);
+    for (auto a: expr.leaving)
+      PrePostActionExprStmtVisitor::visit(*a.first);
+    for (auto a: expr.transiting)
+      PrePostActionExprStmtVisitor::visit(*a.first);
+  }
+
+  // Bind a referenced action to its ActionStmt, or report why that failed.
+  void visit(RefAction& action) override {
+    ActionStmt* r = resolve_action(mo, action.qualified, action.ident);
+    if (! r) {
+      n_errors++;
+      if (action.qualified.size())
+        mo.locfile.error(action.loc, "'%s::%s' undefined", action.qualified.c_str(), action.ident.c_str());
+      else
+        mo.locfile.error(action.loc, "'%s' undefined", action.ident.c_str());
+    } else if (r == (Stmt*)1) {
+      // sentinel from resolve_action: more than one candidate
+      n_errors++;
+      mo.locfile.error(action.loc, "'%s' redefined", action.ident.c_str());
+    } else
+      action.define_stmt = r;
+  }
+
+  // Raise the global AB to cover the upper ends of the bracket's intervals.
+  // NOTE(review): AB presumably is the alphabet bound -- confirm where it is declared.
+  void visit(BracketExpr& expr) override {
+    for (auto& x: expr.intervals.to)
+      AB = max(AB, x.second);
+  }
+  // Resolve a call target; only a DefineStmt is acceptable.
+  void visit(CallExpr& expr) override {
+    Stmt* r = resolve(mo, expr.qualified, expr.ident);
+    if (! r)
+      error_undefined(expr.loc, expr.qualified, expr.ident);
+    else if (r == (Stmt*)1)
+      error_ambiguous(expr.loc, expr.ident);
+    else if (auto d = dynamic_cast<PreprocessDefineStmt*>(r))
+      error_misuse_macro("CallExpr", expr.loc, expr.qualified, expr.ident);
+    else if (auto d = dynamic_cast<DefineStmt*>(r)) {
+      used_as_call[d].push_back(&expr);
+      expr.define_stmt = d;
+    } else
+      assert(0);
+  }
+  // Resolve a collapse target; only a DefineStmt is acceptable.
+  void visit(CollapseExpr& expr) override {
+    Stmt* r = resolve(mo, expr.qualified, expr.ident);
+    if (! r)
+      error_undefined(expr.loc, expr.qualified, expr.ident);
+    else if (r == (Stmt*)1)
+      error_ambiguous(expr.loc, expr.ident);
+    else if (auto d = dynamic_cast<PreprocessDefineStmt*>(r))
+      error_misuse_macro("CollapseExpr", expr.loc, expr.qualified, expr.ident);
+    else if (auto d = dynamic_cast<DefineStmt*>(r)) {
+      used_as_collapse[d].push_back(&expr);
+      expr.define_stmt = d;
+    } else
+      assert(0);
+  }
+  // Visit the right-hand side with `stmt` pointing at this definition.
+  void visit(DefineStmt& stmt) override {
+    this->stmt = &stmt;
+    PrePostActionExprStmtVisitor::visit(*stmt.rhs);
+    this->stmt = NULL;
+  }
+  // Resolve an embed target: a macro contributes its value, a definition
+  // becomes a dependency of the definition currently being visited.
+  void visit(EmbedExpr& expr) override {
+    // introduce dependency
+    Stmt* r = resolve(mo, expr.qualified, expr.ident);
+    if (! r)
+      error_undefined(expr.loc, expr.qualified, expr.ident);
+    else if (r == (Stmt*)1)
+      error_ambiguous(expr.loc, expr.ident);
+    else if (auto d = dynamic_cast<PreprocessDefineStmt*>(r)) {
+      // enlarge alphabet
+      expr.define_stmt = NULL;
+      expr.macro_value = d->value;
+      AB = max(AB, d->value+1);
+    } else if (auto d = dynamic_cast<DefineStmt*>(r)) {
+      depended_by[d].push_back(stmt);
+      used_as_embed[d].push_back(&expr);
+      expr.define_stmt = d;
+    } else
+      assert(0);
+  }
+private:
+  // Report "<name> undefined", with the qualifier when there is one.
+  void error_undefined(const Location& loc, const string& qualified, const string& ident) {
+    n_errors++;
+    if (qualified.size())
+      mo.locfile.error(loc, "'%s::%s' undefined", qualified.c_str(), ident.c_str());
+    else
+      mo.locfile.error(loc, "'%s' undefined", ident.c_str());
+  }
+  // Report a name that matched more than one candidate.
+  void error_ambiguous(const Location& loc, const string& ident) {
+    n_errors++;
+    mo.locfile.error(loc, "ambiguous '%s'", ident.c_str());
+  }
+  // Report a macro used where a definition is required; `name` is the
+  // expression kind shown in the message.
+  void error_misuse_macro(const char* name, const Location& loc, const string& qualified, const string& ident) {
+    n_errors++;
+    if (qualified.size())
+      mo.locfile.error(loc, "macro '%s::%s' used as %s", qualified.c_str(), ident.c_str(), name);
+    else
+      mo.locfile.error(loc, "macro '%s' used as %s", ident.c_str(), name);
+  }
+};
+
+// Opens, reads and parses one source file and returns its Module.
+//
+// `filename` of "-" means standard input.  Other names are tried as given and
+// then under each entry of opt_include_paths.  Modules are cached by
+// (device, inode), so loading the same file twice returns the same Module;
+// standard input uses the key {0, 0}.
+//
+// Returns NULL (after incrementing n_errors) when the file cannot be opened.
+// When parsing yields no statement list, the parser's error count is added to
+// n_errors and the module is returned with status BAD.
+// NOTE(review): when parse() reports errors but still yields a statement list,
+// the count is not added to n_errors here -- confirm that is intended.
+Module* load_module(long& n_errors, const string& filename)
+{
+  FILE* file = stdin;
+  if (filename != "-") {
+    file = fopen(filename.c_str(), "r");
+    for (string& include: opt_include_paths) {
+      if (file) break;
+      file = fopen((include+'/'+filename).c_str(), "r");
+    }
+  }
+  if (! file) {
+    n_errors++;
+    return NULL;
+  }
+
+  pair<dev_t, ino_t> inode{0, 0}; // stdin -> {0, 0}
+  if (file != stdin) {
+    struct stat sb;
+    if (fstat(fileno(file), &sb) < 0)
+      err_exit(EX_OSFILE, "fstat '%s'", filename.c_str());
+    inode = {sb.st_dev, sb.st_ino};
+  }
+  if (inode2module.count(inode)) {
+    fclose(file);
+    return &inode2module[inode];
+  }
+  Module& mo = inode2module[inode];
+
+  // Slurp the whole file.  A short read means end of input (or an error), so
+  // stop there instead of calling fread once more.
+  char buf[BUF_SIZE];
+  string data;
+  size_t r;
+  while ((r = fread(buf, 1, sizeof buf, file)) > 0) {
+    data.append(buf, r);
+    if (r < sizeof buf) break;
+  }
+  fclose(file);
+  // LocationFile expects the text to end with a newline.
+  if (data.empty() || data.back() != '\n')
+    data.push_back('\n');
+  LocationFile locfile(filename, data);
+
+  Stmt* toplevel = NULL;
+  mo.locfile = locfile;
+  mo.filename = filename;
+  long errors = parse(locfile, toplevel);
+  if (! toplevel) {
+    n_errors += errors;
+    mo.status = BAD;
+    mo.toplevel = NULL;
+    return &mo;
+  }
+  mo.toplevel = toplevel;
+  return &mo;
+}
+
+// Orders all DefineStmts by a depth-first walk over the depended_by edges and
+// returns the reversed post-order.  A cycle among the edges is reported as
+// "circular embedding" and counted in n_errors.  With opt_dump_embed set, the
+// per-definition counters accumulated during the walk are printed.
+static vector<DefineStmt*> topo_define_stmts(long& n_errors)
+{
+  vector<DefineStmt*> topo;
+  // DFS stack of the definitions on the current path.
+  vector<DefineStmt*> st;
+  unordered_map<DefineStmt*, i8> vis; // 0: unvisited; 1: in stack; 2: visited; 3: in a cycle
+  // cnt[u]: 1 if u is exported, plus the counters of everything u points to.
+  unordered_map<DefineStmt*, long> cnt;
+  // Returns true when a cycle was found at or below u.
+  function<bool(DefineStmt*)> dfs = [&](DefineStmt* u) {
+    if (vis[u] == 2)
+      return false;
+    if (vis[u] == 3)
+      return true;
+    if (vis[u] == 1) {
+      // u is already on the current path: report the cycle
+      u->module->locfile.error_context(u->loc, "'%s': circular embedding", u->lhs.c_str());
+      // find the position just after u's entry on the stack
+      long i = st.size();
+      while (st[i-1] != u)
+        i--;
+      // push u once more so the loop below also reports it at the end
+      st.push_back(st[i-1]);
+      for (; i < st.size(); i++) {
+        vis[st[i]] = 3;
+        fputs("  ", stderr);
+        st[i]->module->locfile.error_context(st[i]->loc, "required by %s", st[i]->lhs.c_str());
+      }
+      fputs("\n", stderr);
+      return true;
+    }
+    cnt[u] = u->export_ ? 1 : 0;
+    vis[u] = 1;
+    st.push_back(u);
+    bool cycle = false;
+    for (auto v: depended_by[u])
+      if (dfs(v))
+        cycle = true;
+      else
+        cnt[u] += cnt[v];
+    st.pop_back();
+    vis[u] = 2;
+    topo.push_back(u);
+    return cycle;
+  };
+  for (auto& d: depended_by)
+    if (! vis[d.first] && dfs(d.first)) // detected cycle
+      n_errors++;
+  reverse(ALL(topo));
+  if (opt_dump_embed) {
+    magenta(); printf("=== Embed\n"); sgr0();
+    for (auto stmt: topo)
+      if (cnt[stmt] > 0)
+        printf("count(%s::%s) = %ld\n", stmt->module->filename.c_str(), stmt->lhs.c_str(), cnt[stmt]);
+  }
+  return topo;
+}
+
+// Loads `filename` as the main module and drives the whole pipeline:
+// import/definition registration, name resolution, usage warnings, optional
+// dumps, topological ordering, compilation and finally output generation in
+// the mode selected by opt_mode.
+// Returns the number of errors encountered (0 on success, and also 0 right
+// after the checks when opt_check is set).
+long load(const string& filename)
+{
+  long n_errors = 0;
+  Module* mo = load_module(n_errors, filename);
+  if (! mo) {
+    // name the file that could not be opened, like the fstat message does
+    err_exit(EX_OSFILE, "fopen '%s'", filename.c_str());
+    return n_errors;
+  }
+  if (mo->status == BAD)
+    return n_errors;
+  main_module = mo;
+
+  // Pass 1: process modules until none is left UNPROCESSED; visiting an import
+  // may add further modules to inode2module.
+  DP(1, "Processing import & def");
+  for(;;) {
+    bool done = true;
+    for (auto& it: inode2module)
+      if (it.second.status == UNPROCESSED) {
+        done = false;
+        Module& mo = it.second;
+        mo.status = GOOD;
+        long old = n_errors;
+        ModuleImportDef p{mo, n_errors};
+        for (Stmt* x = mo.toplevel; x; x = x->next)
+          x->accept(p);
+        mo.status = old == n_errors ? GOOD : BAD;
+      }
+    if (done) break;
+  }
+  if (n_errors)
+    return n_errors;
+
+  // Pass 2: resolve names in every good module.
+  DP(1, "Processing use");
+  for (auto& it: inode2module)
+    if (it.second.status == GOOD) {
+      Module& mo = it.second;
+      ModuleUse p{mo, n_errors};
+      for (Stmt* x = mo.toplevel; x; x = x->next)
+        x->accept(p);
+    }
+  if (n_errors)
+    return n_errors;
+
+  // warning: not used solely as CallExpr, CollapseExpr or EmbedExpr
+  // The three maps are ordered by key, so they are walked in lock step: x is
+  // the smallest key still pending and c counts the maps that contain it.
+  {
+    auto it0 = used_as_call.begin(), it0e = used_as_call.end(),
+         it1 = used_as_collapse.begin(), it1e = used_as_collapse.end(),
+         it2 = used_as_embed.begin(), it2e = used_as_embed.end();
+    while (it0 != it0e || it1 != it1e || it2 != it2e) {
+      long c = 0;
+      DefineStmt* x = NULL;
+      if (it0 != it0e && (! x || it0->first < x)) x = it0->first;
+      if (it1 != it1e && (! x || it1->first < x)) x = it1->first;
+      if (it2 != it2e && (! x || it2->first < x)) x = it2->first;
+      if (it0 != it0e && it0->first == x) c++;
+      if (it1 != it1e && it1->first == x) c++;
+      if (it2 != it2e && it2->first == x) c++;
+      if (c > 1) {
+        x->module->locfile.warning(x->loc, "'%s' is not used solely as CallExpr, CollapseExpr or EmbedExpr", x->lhs.c_str());
+        if (it0 != it0e && it0->first == x)
+          for (auto* y: it0->second) {
+            fputs("  ", stderr);
+            y->stmt->module->locfile.warning_context(y->loc, "required by %s", y->stmt->lhs.c_str());
+          }
+        if (it1 != it1e && it1->first == x)
+          for (auto* y: it1->second) {
+            fputs("  ", stderr);
+            y->stmt->module->locfile.warning_context(y->loc, "required by %s", y->stmt->lhs.c_str());
+          }
+        if (it2 != it2e && it2->first == x)
+          for (auto* y: it2->second) {
+            fputs("  ", stderr);
+            y->stmt->module->locfile.warning_context(y->loc, "required by %s", y->stmt->lhs.c_str());
+          }
+      }
+      // advance every iterator that was sitting on x
+      if (it0 != it0e && it0->first == x) ++it0;
+      if (it1 != it1e && it1->first == x) ++it1;
+      if (it2 != it2e && it2->first == x) ++it2;
+    }
+  }
+
+  if (opt_dump_module) {
+    magenta(); printf("=== Module\n"); sgr0();
+    for (auto& it: inode2module)
+      if (it.second.status == GOOD) {
+        Module& mo = it.second;
+        print_module_info(mo);
+      }
+    puts("");
+  }
+
+  if (opt_dump_tree) {
+    magenta(); printf("=== Tree\n"); sgr0();
+    StmtPrinter p;
+    for (auto& it: inode2module)
+      if (it.second.status == GOOD) {
+        Module& mo = it.second;
+        yellow(); printf("filename: %s\n", mo.filename.c_str()); sgr0();
+        for (Stmt* x = mo.toplevel; x; x = x->next)
+          x->accept(p);
+      }
+    puts("");
+  }
+
+  DP(1, "Topological sorting");
+  vector<DefineStmt*> topo = topo_define_stmts(n_errors);
+  if (n_errors)
+    return n_errors;
+
+  if (opt_check)
+    return 0;
+
+  // AB has been updated by ModuleUse
+  action_label_base = action_label = AB;
+  call_label_base = call_label = action_label+1000000;
+  collapse_label_base = collapse_label = call_label+1000000;
+
+  DP(1, "Compiling DefineStmt");
+  for (auto stmt: topo)
+    compile(stmt);
+
+  // "-" selects standard output
+  output = strcmp(opt_output_filename, "-") ? fopen(opt_output_filename, "w") : stdout;
+  if (! output) {
+    n_errors++;
+    err_exit(EX_OSFILE, "fopen '%s'", opt_output_filename);
+    return n_errors;
+  }
+
+  DP(1, "Compiling exporting DefineStmt (coalescing referenced CallExpr/CollapseExpr)");
+  for (Stmt* x = main_module->toplevel; x; x = x->next)
+    if (auto xx = dynamic_cast<DefineStmt*>(x))
+      if (xx->export_ && ! compile_export(xx))
+        n_errors++;
+  if (n_errors)
+    return n_errors;
+
+  for (Stmt* x = main_module->toplevel; x; x = x->next)
+    if (auto xx = dynamic_cast<DefineStmt*>(x))
+      if (xx->export_) {
+        FsaAnno& anno = compiled[xx];
+        if (opt_dump_automaton)
+          print_automaton(anno.fsa);
+        if (opt_dump_assoc)
+          print_assoc(anno);
+      }
+
+  if (opt_mode == Mode::cxx) {
+    if (opt_output_header_filename) {
+      output_header = fopen(opt_output_header_filename, "w");
+      if (! output_header) {
+        n_errors++;
+        err_exit(EX_OSFILE, "fopen '%s'", opt_output_header_filename);
+        return n_errors;
+      }
+    }
+    DP(1, "Generating C++");
+    generate_cxx(mo);
+    if (output_header)
+      fclose(output_header);
+  } else if (opt_mode == Mode::graphviz) {
+    DP(1, "Generating Graphviz dot");
+    generate_graphviz(mo);
+  } else if (opt_mode == Mode::interactive) {
+    DP(1, "Testing given string");
+    // the first exported definition of the main module is the one tested
+    DefineStmt* main_export = NULL;
+    for (Stmt* x = main_module->toplevel; x; x = x->next)
+      if (auto xx = dynamic_cast<DefineStmt*>(x))
+        if (xx->export_) {
+          main_export = xx;
+          break;
+        }
+    if (! main_export)
+      puts("no exporting DefineStmt");
+    else {
+      printf("Testing %s\n", main_export->lhs.c_str());
+      repl(main_export);
+    }
+  }
+
+  fclose(output);
+  return n_errors;
+}
+
+// Frees the statement list of every module in the cache.
+void unload_all()
+{
+  for (auto entry = inode2module.begin(); entry != inode2module.end(); ++entry)
+    stmt_free(entry->second.toplevel);
+}

+ 35 - 0
yanshi/src/loader.hh

@@ -0,0 +1,35 @@
+#pragma once
+#include "syntax.hh"
+
+#include <map>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+using std::map;
+using std::set;
+using std::string;
+using std::unordered_map;
+using std::vector;
+
+// Loader state of a module.  UNPROCESSED is deliberately the zero value; BAD
+// marks a module that failed to parse or produced errors, GOOD one that passed.
+enum ModuleStatus { UNPROCESSED = 0, BAD, GOOD };
+
+// One loaded source file together with the names it defines and imports.
+struct Module {
+  ModuleStatus status;
+  // Source text and line map, used for diagnostics.
+  LocationFile locfile;
+  string filename;
+  // Head of the parsed statement list (linked through Stmt::next); NULL when
+  // parsing produced nothing.
+  Stmt* toplevel;
+  // Definitions by name.
+  unordered_map<string, DefineStmt*> defined;
+  // Modules imported without a qualifier, each listed once.
+  vector<Module*> unqualified_import;
+  // Modules imported under a qualifier, keyed by that qualifier.
+  unordered_map<string, Module*> qualified_import;
+  // Named actions by name.
+  unordered_map<string, ActionStmt*> defined_action;
+  // #define macros by name.
+  unordered_map<string, PreprocessDefineStmt*> macro;
+};
+
+Stmt* resolve(Module& mo, const string qualified, const string& ident);
+long load(const string& filename);
+Module* load_module(long& n_errors, const string& filename);
+void unload_all();
+extern Module* main_module;
+extern FILE *output, *output_header;
+extern map<DefineStmt*, vector<Expr*>> used_as_call, used_as_collapse, used_as_embed;

+ 113 - 0
yanshi/src/location.cc

@@ -0,0 +1,113 @@
+#include "common.hh"
+#include "location.hh"
+
+#include <algorithm>
+#include <cstdarg>
+using namespace std;
+
+// Stores the file name and text and builds linemap, where linemap[k] is the
+// offset of the first character of line k (0-based); one extra entry follows
+// the final newline.
+LocationFile::LocationFile(const string& filename, const string& data) : filename(filename), data(data)
+{
+  // data ends with '\n'
+  linemap.clear();
+  linemap.push_back(0);
+  for (string::size_type pos = 0; pos < data.size(); pos++) {
+    if (data[pos] != '\n')
+      continue;
+    linemap.push_back(long(pos) + 1);
+  }
+}
+
+// Prints the source lines covered by `loc` to stderr, indented by two spaces,
+// switching to magenta at loc.start and resetting the colour after loc.end.
+// A span of eight or more lines is abbreviated: the first lines, a row of
+// dots, then the last lines.
+// NOTE(review): FOR(i, a, b) comes from common.hh and is assumed to iterate i
+// over [a, b) -- confirm.
+void LocationFile::context(const Location& loc) const
+{
+  long line1, col1, line2, col2;
+  locate(loc, line1, col1, line2, col2);
+  if (line1 == line2) {
+    // single line: print it whole, colouring only the located part
+    fputs("  ", stderr);
+    FOR(i, linemap[line1], line1+1 < linemap.size() ? linemap[line1+1] : data.size()) {
+      if (i == loc.start)
+        magenta();
+      fputc(data[i], stderr);
+      if (i+1 == loc.end)
+        sgr0();
+    }
+  } else {
+    // `first` is true at the start of an output line, where the indent is due
+    bool first = true;
+    fputs("  ", stderr);
+    // first line: the colour starts at loc.start and stays on
+    FOR(i, linemap[line1], linemap[line1+1]) {
+      if (i == loc.start)
+        magenta();
+      fputc(data[i], stderr);
+    }
+    if (line2-line1 < 8) {
+      // short span: print every line in between
+      FOR(i, linemap[line1+1], linemap[line2]) {
+        if (first) { first = false; fputs("  ", stderr); }
+        fputc(data[i], stderr);
+        if (data[i] == '\n') first = true;
+      }
+    } else {
+      // long span: three lines after the first, dots, three before the last
+      FOR(i, linemap[line1+1], linemap[line1+4]) {
+        if (first) { first = false; fputs("  ", stderr); }
+        fputc(data[i], stderr);
+        if (data[i] == '\n') first = true;
+      }
+      fputs("  ........\n", stderr);
+      FOR(i, linemap[line2-3], linemap[line2]) {
+        if (first) { first = false; fputs("  ", stderr); }
+        fputc(data[i], stderr);
+        if (data[i] == '\n') first = true;
+      }
+    }
+    // last line: the colour is reset right after the character before loc.end
+    FOR(i, linemap[line2], line2+1 < linemap.size() ? linemap[line2+1] : data.size()) {
+      if (first) { first = false; fputs("  ", stderr); }
+      fputc(data[i], stderr);
+      if (i+1 == loc.end)
+        sgr0();
+    }
+  }
+}
+
+// Converts the offsets in `loc` to 0-based line numbers and column offsets.
+// line1/col1 describe loc.start.  line2 is the line holding the last
+// character of the span (offset loc.end-1, clamped at 0), and col2 is loc.end
+// measured from the start of that line.
+void LocationFile::locate(const Location& loc, long& line1, long& col1, long& line2, long& col2) const
+{
+  const auto first_line = linemap.begin();
+  const auto past_lines = linemap.end();
+  const long last_char = max(loc.end-1, 0L);
+
+  // the line containing an offset is the last one starting at or before it
+  line1 = (upper_bound(first_line, past_lines, loc.start) - first_line) - 1;
+  line2 = (upper_bound(first_line, past_lines, last_char) - first_line) - 1;
+  col1 = loc.start - linemap[line1];
+  col2 = loc.end - linemap[line2];
+}
+
+// Writes "<file> <line>:<col>-<col> " (or "<line>-<line>:<col>-<col> " for a
+// span over several lines) to stderr, with 1-based lines and start column.
+void LocationFile::report_location(const Location& loc) const
+{
+  long line1, col1, line2, col2;
+  locate(loc, line1, col1, line2, col2);
+
+  yellow(2);
+  fprintf(stderr, "%s ", filename.c_str());
+  cyan(2);
+
+  const bool spans_lines = line1 != line2;
+  if (spans_lines) {
+    fprintf(stderr, "%ld-%ld:%ld-%ld ", line1+1, line2+1, col1+1, col2);
+    return;
+  }
+  fprintf(stderr, "%ld:%ld-%ld ", line1+1, col1+1, col2);
+}
+
+void LocationFile::error(const Location& loc, const char* fmt, ...) const
+{
+  report_location(loc);
+  red(2);
+  fprintf(stderr, "error ");
+  va_list va;
+  va_start(va, fmt);
+  vfprintf(stderr, fmt, va);
+  va_end(va);
+  fputs("\n", stderr);
+  sgr0(2);
+}
+
+// Prints the location of `loc`, the word "warning" in yellow and the
+// printf-style message, followed by a newline, to stderr; then resets the colour.
+void LocationFile::warning(const Location& loc, const char* fmt, ...) const
+{
+  report_location(loc);
+  yellow(2);
+  fputs("warning ", stderr);
+
+  va_list args;
+  va_start(args, fmt);
+  vfprintf(stderr, fmt, args);
+  va_end(args);
+
+  fputc('\n', stderr);
+  sgr0(2);
+}

+ 32 - 0
yanshi/src/location.hh

@@ -0,0 +1,32 @@
+#pragma once
+#include "common.hh"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+// A range of offsets into LocationFile::data. `end` appears to be exclusive:
+// LocationFile::locate() treats end-1 as the last covered offset.
+struct Location { long start, end; };
+
+// A named source text plus a line index, used to turn Location offsets into
+// line/column diagnostics written to stderr.
+struct LocationFile {
+  std::string filename, data;
+  // Offsets into `data`; locate() binary-searches this to map an offset to a
+  // line index (presumably one entry per line start; filled by the constructor).
+  std::vector<long> linemap;
+
+  LocationFile() = default;
+  LocationFile(const std::string& filename, const std::string& data);
+  LocationFile& operator=(const LocationFile&) = default;
+  // Prints the source lines covered by `loc` to stderr.
+  void context(const Location& loc) const;
+  // Computes 0-based line indices and columns for both ends of `loc`.
+  void locate(const Location& loc, long& line1, long& col1, long& line2, long& col2) const;
+  // Prints "<filename> <line/column range> " to stderr.
+  void report_location(const Location& loc) const;
+  // printf-style diagnostics prefixed with the location of `loc`.
+  void error(const Location& loc, const char* fmt, ...) const;
+  void warning(const Location& loc, const char* fmt, ...) const;
+  // error() followed by context(). The arguments are forwarded into a C
+  // variadic call, so they must be types that are valid for the format string.
+  template<class... Args>
+  void error_context(const Location& loc, const char* fmt, Args&&... args) const {
+    error(loc, fmt, std::forward<Args>(args)...);
+    context(loc);
+  }
+  // warning() followed by context(); same argument restrictions as error_context().
+  template<class... Args>
+  void warning_context(const Location& loc, const char* fmt, Args&&... args) const {
+    warning(loc, fmt, std::forward<Args>(args)...);
+    context(loc);
+  }
+};

+ 154 - 0
yanshi/src/main.cc

@@ -0,0 +1,154 @@
+#include "common.hh"
+#include "fsa.hh"
+#include "loader.hh"
+#include "option.hh"
+
+#include <errno.h>
+#include <getopt.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <string>
+#include <sysexits.h>
+#include <unistd.h>
+using namespace std;
+
+// Prints the usage text to `fh` and terminates the process: exit status 0
+// when `fh` is stdout, EX_USAGE otherwise. Never returns.
+//
+// The option list mirrors the getopt tables in main(): short aliases are shown
+// wherever main() accepts one, and an argument placeholder is shown only for
+// options that actually take an argument.
+void print_help(FILE *fh)
+{
+  fprintf(fh, "Usage: %s [OPTIONS] dir\n", program_invocation_short_name);
+  fputs(
+        "\n"
+        "Options:\n"
+        "  -b,--bytes                make labels range over [0,256), Unicode literals will be treated as UTF-8 bytes\n"
+        "  -C                        generate C source code (default: C++)\n"
+        "  -c,--check                check syntax & use/def\n"
+        "  -d,--debug <level>        debug level\n"
+        "  -l,--debug-output <file>  filename for debug output\n"
+        "  --dump-action             dump associated actions for each edge\n"
+        "  --dump-assoc              dump associated AST Expr for each state\n"
+        "  --dump-automaton          dump automata\n"
+        "  --dump-embed              dump statistics of EmbedExpr\n"
+        "  --dump-module             dump module use/def/...\n"
+        "  --dump-tree               dump AST\n"
+        "  --extern-c                generate extern \"C\" specifier\n"
+        "  -G,--graph                output a Graphviz dot file\n"
+        "  -I,--import <dir>         add <dir> to search path for 'import'\n"
+        "  -i,--interactive          interactive mode\n"
+        "  --max-return-stack <n>    max length of return stack in C generator (default: 100)\n"
+        "  -k,--keep-inaccessible    do not perform accessible/co-accessible\n"
+        "  -S,--standalone           generate header and 'main()'\n"
+        "  -s,--substring-grammar    construct regular approximation of the substring grammar. Inner states of nonterminals labeled 'intact' are not connected to start/final\n"
+        "  -o,--output <file>        .cc output filename\n"
+        "  -O,--output-header <file> .hh output filename\n"
+        "  -h, --help                display this help and exit\n"
+        "\n"
+        , fh);
+  exit(fh == stdout ? 0 : EX_USAGE);
+}
+
+// Program entry point: parses command-line options into the opt_* globals,
+// loads the first positional argument (or "-" when there is none) and
+// returns 2 if load() reported any errors, 0 otherwise.
+int main(int argc, char *argv[])
+{
+  setlocale(LC_ALL, "");
+  int opt;
+  // Long options. Entries that share a letter with the short option string
+  // below must agree with it on whether an argument is taken.
+  static struct option long_options[] = {
+    {"bytes",               no_argument,       0,   'b'},
+    // --check is a plain flag, like -c: the handler never reads optarg, so it
+    // must not consume the following word (typically the input file name).
+    {"check",               no_argument,       0,   'c'},
+    {"debug",               required_argument, 0,   'd'},
+    {"debug-output",        required_argument, 0,   'l'},
+    {"dump-action",         no_argument,       0,   1000},
+    {"dump-assoc",          no_argument,       0,   1001},
+    {"dump-automaton",      no_argument,       0,   1002},
+    {"dump-embed",          no_argument,       0,   1003},
+    {"dump-module",         no_argument,       0,   1004},
+    {"dump-tree",           no_argument,       0,   1005},
+    {"extern-c",            no_argument,       0,   1007},
+    {"graph",               no_argument,       0,   'G'},
+    {"import",              required_argument, 0,   'I'},
+    {"interactive",         no_argument,       0,   'i'},
+    {"max-return-stack",    required_argument, 0,   1006},
+    {"keep-inaccessible",   no_argument,       0,   'k'},
+    {"standalone",          no_argument,       0,   'S'},
+    {"substring-grammar",   no_argument,       0,   's'},
+    {"output",              required_argument, 0,   'o'},
+    {"output-header",       required_argument, 0,   'O'},
+    {"help",                no_argument,       0,   'h'},
+    {0,                     0,                 0,   0},
+  };
+
+  while ((opt = getopt_long(argc, argv, "bCDcd:GhI:ikl:O:o:Ss", long_options, NULL)) != -1) {
+    switch (opt) {
+    case 'b':
+      opt_bytes = true;
+      AB = 256;
+      break;
+    case 'C':
+      opt_gen_c = true;
+      break;
+    case 'D':
+      // accepted but has no effect
+      break;
+    case 'c':
+      opt_check = true;
+      break;
+    case 'd':
+      debug_level = get_long(optarg);
+      break;
+    case 'G':
+      opt_mode = Mode::graphviz;
+      break;
+    case 'h':
+      print_help(stdout);
+      break;
+    case 'I':
+      opt_include_paths.push_back(string(optarg));
+      break;
+    case 'i':
+      opt_mode = Mode::interactive;
+      break;
+    case 'k':
+      opt_keep_inaccessible = true;
+      break;
+    case 'l':
+      if (debug_file)
+        err_exit(EX_USAGE, "multiple '-l'");
+      debug_file = fopen(optarg, "w");
+      if (! debug_file)
+        err_exit(EX_OSFILE, "fopen");
+      break;
+    case 'O':
+      opt_output_header_filename = optarg;
+      break;
+    case 'o':
+      opt_output_filename = optarg;
+      break;
+    case 'S':
+      opt_standalone = true;
+      break;
+    case 's':
+      opt_substring_grammar = true;
+      break;
+    case 1000: opt_dump_action = true; break;
+    case 1001: opt_dump_assoc = true; break;
+    case 1002: opt_dump_automaton = true; break;
+    case 1003: opt_dump_embed = true; break;
+    case 1004: opt_dump_module = true; break;
+    case 1005: opt_dump_tree = true; break;
+    case 1006:
+      opt_max_return_stack = get_long(optarg);
+      break;
+    case 1007: opt_gen_extern_c = true; break;
+    case '?':
+      print_help(stderr);
+      break;
+    }
+  }
+  // Debug output goes to stderr unless -l selected a file.
+  if (! debug_file)
+    debug_file = stderr;
+  argc -= optind;
+  argv += optind;
+
+  long n_errors = load(argc ? argv[0] : "-");
+  unload_all();
+  fclose(debug_file);
+  return n_errors ? 2 : 0;
+}

+ 13 - 0
yanshi/src/option.cc

@@ -0,0 +1,13 @@
+#include "common.hh"
+#include "option.hh"
+#include <stdio.h>
+
+// Boolean option flags; zero-initialized (false) and switched on by the
+// option parsing in main().
+bool opt_bytes, opt_check, opt_dump_action, opt_dump_assoc, opt_dump_automaton, opt_dump_embed, opt_dump_module, opt_dump_tree, opt_gen_c, opt_gen_extern_c, opt_keep_inaccessible, opt_standalone, opt_substring_grammar;
+
+// AB defaults to MAX_CODEPOINT+1 and is set to 256 by main() for -b/--bytes
+// (presumably the size of the label alphabet).
+long AB = MAX_CODEPOINT+1, opt_max_return_stack = 100;
+long debug_level = 3;
+// Null until main() opens the -l file or falls back to stderr.
+FILE* debug_file;
+const char* opt_output_filename = "-";
+const char* opt_output_header_filename;
+Mode opt_mode = Mode::cxx;
+vector<string> opt_include_paths;

+ 13 - 0
yanshi/src/option.hh

@@ -0,0 +1,13 @@
+#pragma once
+#include <string>
+#include <vector>
+using std::string;
+using std::vector;
+
+// Command-line option state, defined in option.cc and filled in by main().
+extern bool opt_bytes, opt_check, opt_dump_action, opt_dump_assoc, opt_dump_automaton, opt_dump_embed, opt_dump_module, opt_dump_tree, opt_gen_c, opt_gen_extern_c, opt_keep_inaccessible, opt_standalone, opt_substring_grammar;
+extern long AB, opt_max_return_stack;
+extern const char* opt_output_filename;
+extern const char* opt_output_header_filename;
+// Selected by -G (graphviz) and -i (interactive); cxx is the default.
+enum class Mode {cxx, graphviz, interactive};
+extern Mode opt_mode;
+// Directories added with -I/--import.
+extern vector<string> opt_include_paths;

+ 270 - 0
yanshi/src/parser.y

@@ -0,0 +1,270 @@
+/* Declarations needed by the semantic value / location types below and by
+   users of the parse() entry point. */
+%code requires {
+#include "common.hh"
+#include "location.hh"
+#include "option.hh"
+#include "syntax.hh"
+
+#include <limits.h>
+#include <unicode/utf8.h>
+
+#define YYINITDEPTH 1000
+/* Use the project's Location {start, end} as the parser's location type. */
+#define YYLTYPE Location
+/* Location of a reduced symbol: from the start of the first right-hand side
+   symbol to the end of the last one; for an empty right-hand side, the
+   location in slot 0 is copied unchanged. */
+#define YYLLOC_DEFAULT(Loc, Rhs, N)             \
+  do {                                          \
+    if (N) {                                    \
+      (Loc).start = YYRHSLOC(Rhs, 1).start;     \
+      (Loc).end = YYRHSLOC(Rhs, N).end;         \
+    } else {                                    \
+      (Loc).start = YYRHSLOC(Rhs, 0).start;     \
+      (Loc).end = YYRHSLOC(Rhs, 0).end;         \
+    }                                           \
+  } while (0)
+
+/* Parses locfile.data into a statement list stored in `res`; returns the
+   number of errors reported. */
+int parse(const LocationFile& locfile, Stmt*& res);
+}
+
+/* Pure parser with location tracking and verbose syntax error messages. */
+%locations
+%error-verbose
+%define api.pure
+
+/* The same four extra arguments are passed to both yyparse() and yylex(). */
+%parse-param {Stmt*& res}
+%parse-param {long& errors}
+%parse-param {const LocationFile& locfile}
+%parse-param {void** lexer}
+%lex-param {Stmt*& res}
+%lex-param {long& errors}
+%lex-param {const LocationFile& locfile}
+%lex-param {void** lexer}
+
+%union {
+  long integer;
+  string* str;
+  DisjointIntervals* intervals;
+  Action* action;
+  Expr* expr;
+  Stmt* stmt;
+  char* errmsg;
+}
+/* Heap-allocated semantic values discarded by the parser are deleted here. */
+%destructor { delete $$; } <str>
+%destructor { delete $$; } <action>
+%destructor { delete $$; } <expr>
+%destructor { delete $$; } <intervals>
+%destructor { delete $$; } <stmt>
+
+%token ACTION AMPERAMPER AS COLONCOLON CPP DOTDOT EPSILON EXPORT IMPORT INTACT INVALID_CHARACTER PREPROCESS_DEFINE
+%token <integer> CHAR INTEGER
+%token <str> IDENT
+%token <str> BRACED_CODE
+%token <str> STRING_LITERAL
+
+%type <action> action
+%type <expr> concat_expr difference_expr factor repeat intersect_expr union_expr union_expr2 unop_expr
+%type <intervals> bracket bracket_items
+%type <stmt> define_stmt preprocess stmt stmt_list
+
+%{
+#include "lexer.hh"
+
+// Reports `errmsg` at location `loc` through yyerror(). Expands to a single
+// statement and relies on `res`, `errors`, `locfile` and `lexer` being in
+// scope at the point of use.
+#define FAIL(loc, errmsg)                                          \
+  do {                                                             \
+    Location l = loc;                                              \
+    yyerror(&l, res, errors, locfile, lexer, errmsg);              \
+  } while (0)
+
+void yyerror(YYLTYPE* loc, Stmt*& res, long& errors, const LocationFile& locfile, yyscan_t* lexer, const char *errmsg)
+{
+  errors++;
+  locfile.error_context(*loc, "%s", errmsg);
+}
+
+int yylex(YYSTYPE* yylval, YYLTYPE* loc, Stmt*& res, long& errors, const LocationFile& locfile, yyscan_t* lexer)
+{
+  int token = raw_yylex(yylval, loc, *lexer);
+  if (token == INVALID_CHARACTER) {
+    FAIL(*loc, yylval->errmsg ? yylval->errmsg : "invalid character");
+    free(yylval->errmsg);
+  }
+  return token;
+}
+
+#define gen_repeat(x, inner, low, high) \
+  if (low < 0) {                     \
+    FAIL(yyloc, "negative"); \
+  } \
+  if (low > high) { \
+    FAIL(yyloc, "low > high"); \
+  } \
+  x = new RepeatExpr(inner, low, high)
+%}
+
+%%
+
+/* The whole input is a statement list; its head is handed back through `res`. */
+toplevel:
+  stmt_list { res = $1; }
+
+/* Statements form a doubly linked list (next/prev) terminated by an
+   EmptyStmt. Blank lines and statements that failed to parse are skipped. */
+stmt_list:
+    %empty { $$ = new EmptyStmt; }
+  | '\n' stmt_list { $$ = $2; }
+  | stmt stmt_list { $1->next = $2; $2->prev = $1; $$ = $1; }
+  | error stmt_list { $$ = $2; }
+
+/* A single newline-terminated statement. The heap strings produced by the
+   lexer are copied into the new node and then deleted. */
+stmt:
+    define_stmt { $$ = $1; }
+  | preprocess '\n' { $$ = $1; }
+  | IMPORT STRING_LITERAL AS IDENT '\n' { $$ = new ImportStmt(*$2, *$4); delete $2; delete $4; $$->loc = yyloc; }
+  | IMPORT STRING_LITERAL '\n' { string t; $$ = new ImportStmt(*$2, t); delete $2; $$->loc = yyloc; }
+  | ACTION IDENT BRACED_CODE '\n' { $$ = new ActionStmt(*$2, *$3); delete $2; delete $3; $$->loc = yyloc; }
+  | CPP BRACED_CODE '\n' { $$ = new CppStmt(*$2); delete $2; $$->loc = yyloc; }
+
+/* Macro definition: a name bound to an integer. */
+preprocess:
+    PREPROCESS_DEFINE IDENT INTEGER { $$ = new PreprocessDefineStmt(*$2, $3); delete $2; $$->loc = yyloc; }
+
+/* Either '=' or ':' separates the name from the body of a definition. */
+eq:
+    '='
+  | ':'
+
+/* A definition. The body may start on the same line or on the next one
+   (union_expr2 additionally allows line breaks before '|'), optionally with a
+   leading '|'. EXPORT and INTACT are prefixes that set flags on the wrapped
+   DefineStmt; EXPORT may carry braced parameters. */
+define_stmt:
+    IDENT eq union_expr '\n' { $$ = new DefineStmt(*$1, $3); delete $1; $$->loc = yyloc; }
+  | IDENT eq '|' union_expr '\n' { $$ = new DefineStmt(*$1, $4); delete $1; $$->loc = yyloc; }
+  | IDENT eq '\n' union_expr2 '\n' { $$ = new DefineStmt(*$1, $4); delete $1; $$->loc = yyloc; }
+  | IDENT eq '\n' '|' union_expr2 '\n' { $$ = new DefineStmt(*$1, $5); delete $1; $$->loc = yyloc; }
+  | EXPORT define_stmt { $$ = $2; ((DefineStmt*)$$)->export_ = true; $$->loc = yyloc; }
+  | EXPORT BRACED_CODE define_stmt { $$ = $3; ((DefineStmt*)$$)->export_ = true; ((DefineStmt*)$$)->export_params = *$2; delete $2; $$->loc = yyloc; }
+  | INTACT define_stmt { $$ = $2; ((DefineStmt*)$$)->intact = true; $$->loc = yyloc; }
+
+/* Binary operators, from loosest to tightest binding:
+   union '|', intersection '&&' (AMPERAMPER), difference '-', concatenation
+   (juxtaposition), then prefix complement '~'. All binary operators are
+   left-associative. */
+union_expr:
+    intersect_expr { $$ = $1; }
+  | union_expr '|' intersect_expr { $$ = new UnionExpr($1, $3); $$->loc = yyloc; }
+
+/* Same as union_expr, but a newline is allowed before '|'. */
+union_expr2:
+    intersect_expr { $$ = $1; }
+  | union_expr2 '|' intersect_expr { $$ = new UnionExpr($1, $3); $$->loc = yyloc; }
+  | union_expr2 '\n' '|' intersect_expr { $$ = new UnionExpr($1, $4); $$->loc = yyloc; }
+
+intersect_expr:
+    difference_expr { $$ = $1; }
+  | intersect_expr AMPERAMPER difference_expr { $$ = new IntersectExpr($1, $3); $$->loc = yyloc; }
+
+difference_expr:
+    concat_expr { $$ = $1; }
+  | difference_expr '-' concat_expr { $$ = new DifferenceExpr($1, $3); $$->loc = yyloc; }
+
+concat_expr:
+    unop_expr { $$ = $1; }
+  | concat_expr unop_expr { $$ = new ConcatExpr($1, $2); $$->loc = yyloc; }
+
+unop_expr:
+    factor { $$ = $1; }
+  | '~' unop_expr { $$ = new ComplementExpr($2); $$->loc = yyloc; }
+
+/* Primary expressions and postfix operators.
+   Heap strings produced by the lexer are copied into the new node and then
+   deleted. When an alternative reports an error through FAIL, a DotExpr
+   placeholder (with its location set) is produced so parsing can continue. */
+factor:
+    EPSILON { $$ = new EpsilonExpr; $$->loc = yyloc; }
+  | IDENT { string t; $$ = new EmbedExpr(t, *$1); delete $1; $$->loc = yyloc; }
+  | IDENT COLONCOLON IDENT { $$ = new EmbedExpr(*$1, *$3); delete $1; delete $3; $$->loc = yyloc; }
+  | '!' IDENT { string t; $$ = new CollapseExpr(t, *$2); delete $2; $$->loc = yyloc; }
+  | '!' IDENT COLONCOLON IDENT { $$ = new CollapseExpr(*$2, *$4); delete $2; delete $4; $$->loc = yyloc; }
+  | '&' IDENT { string t; $$ = new CallExpr(t, *$2); delete $2; $$->loc = yyloc; }
+  | '&' IDENT COLONCOLON IDENT { $$ = new CallExpr(*$2, *$4); delete $2; delete $4; $$->loc = yyloc; }
+  | STRING_LITERAL { $$ = new LiteralExpr(*$1); delete $1; $$->loc = yyloc; }
+  | '.' { $$ = new DotExpr(); $$->loc = yyloc; }
+  | INTEGER {
+      if (opt_bytes && 256 <= $1) {
+        FAIL(yyloc, "literal integers should be less than 256 in bytes mode");
+        $$ = new DotExpr;
+        $$->loc = yyloc;
+      } else {
+        auto t = new DisjointIntervals;
+        t->emplace($1, $1+1);
+        $$ = new BracketExpr(t);
+        $$->loc = yyloc;
+      }
+    }
+  | bracket { $$ = new BracketExpr($1); $$->loc = yyloc; }
+  | STRING_LITERAL DOTDOT STRING_LITERAL {
+      /* Range between two one-character strings: the first byte of each in
+         bytes mode, the first code point otherwise. */
+      i32 c0, c1, i = 0, j = 0;
+      if (opt_bytes) {
+        c0 = u8((*$1)[0]);
+        c1 = u8((*$3)[0]);
+        i = j = 1;
+      } else {
+        U8_NEXT($1->c_str(), i, $1->size(), c0);
+        U8_NEXT($3->c_str(), j, $3->size(), c1);
+      }
+      /* The lengths must be checked before the strings are released; reading
+         them after `delete` would be a use-after-free. */
+      const bool bad_length = i != $1->size() || j != $3->size();
+      delete $1;
+      delete $3;
+      if (bad_length) {
+        FAIL(yyloc, "endpoints of Unicode range should be of length 1");
+        $$ = new DotExpr;
+        $$->loc = yyloc;
+      } else if (c0 > c1) {
+        FAIL(yyloc, "negative Unicode range");
+        $$ = new DotExpr;
+        $$->loc = yyloc;
+      } else {
+        auto t = new DisjointIntervals;
+        t->emplace(c0, c1+1);
+        $$ = new BracketExpr(t);
+        $$->loc = yyloc;
+      }
+    }
+  | '(' union_expr ')' { $$ = $2; }
+  | '(' error ')' { $$ = new DotExpr; $$->loc = yyloc; }
+  | repeat { $$ = $1; }
+  /* Action attachments; the optional INTEGER is stored alongside the action
+     and defaults to 0. */
+  | factor '>' action { $$ = $1; $$->entering.emplace_back($3, 0L); }
+  | factor '>' INTEGER action { $$ = $1; $$->entering.emplace_back($4, $3); }
+  | factor '@' action { $$ = $1; $$->finishing.emplace_back($3, 0L); }
+  | factor '@' INTEGER action { $$ = $1; $$->finishing.emplace_back($4, $3); }
+  | factor '%' action { $$ = $1; $$->leaving.emplace_back($3, 0L); }
+  | factor '%' INTEGER action { $$ = $1; $$->leaving.emplace_back($4, $3); }
+  | factor '$' action { $$ = $1; $$->transiting.emplace_back($3, 0L); }
+  | factor '$' INTEGER action { $$ = $1; $$->transiting.emplace_back($4, $3); }
+  | factor '+' { $$ = new PlusExpr($1); $$->loc = yyloc; }
+  | factor '?' { $$ = new QuestionExpr($1); $$->loc = yyloc; }
+  | factor '*' { $$ = new StarExpr($1); $$->loc = yyloc; }
+
+/* Bounded repetition: {low,high}, {low,} (no upper bound, LONG_MAX),
+   {n} (exactly n) and {,high} (low defaults to 0). */
+repeat:
+    factor '{' INTEGER ',' INTEGER '}' { gen_repeat($$, $1, $3, $5); $$->loc = yyloc; }
+  | factor '{' INTEGER ',' '}' { gen_repeat($$, $1, $3, LONG_MAX); $$->loc = yyloc; }
+  | factor '{' INTEGER '}' { gen_repeat($$, $1, $3, $3); $$->loc = yyloc; }
+  | factor '{' ',' INTEGER '}' { gen_repeat($$, $1, 0, $4); $$->loc = yyloc; }
+
+/* An action is either a reference by name (optionally qualified with
+   `qualifier::`) or inline braced code. */
+action:
+    IDENT { string t; $$ = new RefAction(t, *$1); delete $1; $$->loc = yyloc; }
+  | IDENT COLONCOLON IDENT { $$ = new RefAction(*$1, *$3); delete $1; delete $3; $$->loc = yyloc; }
+  | BRACED_CODE { $$ = new InlineAction(*$1); delete $1; $$->loc = yyloc; }
+
+/* Character class; a leading '^' flips the collected interval set. */
+bracket:
+    '[' bracket_items ']' { $$ = $2; }
+  | '[' '^' bracket_items ']' {
+      $$ = $3;
+      $$->flip();
+    }
+
+/* Items are accumulated into one DisjointIntervals as half-open intervals:
+   a single CHAR c becomes [c, c+1), a range a-b becomes [a, b+1).
+   A range with a > b is reported and not added. */
+bracket_items:
+    bracket_items CHAR '-' CHAR {
+      $$ = $1;
+      if ($2 > $4)
+        FAIL(yyloc, "negative range in character class");
+      else
+        $$->emplace($2, $4+1);
+    }
+  | bracket_items CHAR {
+      $$ = $1;
+      $$->emplace($2, $2+1);
+    }
+  | %empty { $$ = new DisjointIntervals; }
+
+%%
+
+// Parses `locfile.data` and stores the resulting statement list in `res`.
+// Returns the number of errors reported. When any error was reported the
+// partially built list is freed and `res` is set to NULL.
+int parse(const LocationFile& locfile, Stmt*& res)
+{
+  // `res` is only assigned when the `toplevel` rule is reduced. Clear it
+  // first so that stmt_free() below never sees an indeterminate pointer if
+  // the parser gives up before reaching that point.
+  res = NULL;
+  yyscan_t lexer;
+  raw_yylex_init_extra(0, &lexer);
+  YY_BUFFER_STATE buf = raw_yy_scan_bytes(locfile.data.c_str(), locfile.data.size(), lexer);
+  long errors = 0;
+  yyparse(res, errors, locfile, &lexer);
+  raw_yy_delete_buffer(buf, lexer);
+  raw_yylex_destroy(lexer);
+  if (errors > 0) {
+    stmt_free(res);
+    res = NULL;
+  }
+  return errors;
+}

+ 353 - 0
yanshi/src/repl.cc

@@ -0,0 +1,353 @@
+#include "compiler.hh"
+#include "fsa_anno.hh"
+#include "loader.hh"
+#include "parser.hh"
+#include "lexer.hh" // after parser.hh
+#include "syntax.hh"
+
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <inttypes.h>
+#include <sstream>
+#include <stdlib.h>
+#include <type_traits>
+#include <unicode/utf8.h>
+#include <unordered_map>
+#include <wctype.h>
+#ifdef HAVE_READLINE
+# include <readline/readline.h>
+# include <readline/history.h>
+#endif
+using namespace std;
+
+// How a line typed at the prompt is interpreted; switched by the
+// ".string" / ".integer" commands.
+enum class ReplMode {string, integer};
+static ReplMode mode = ReplMode::string;
+// Automaton currently being exercised; set by the ".stmt" command.
+static const FsaAnno* anno;
+// Set by the ".quit" command to leave the read loop.
+static bool quit;
+
+// A dot-command of the interactive prompt: `name` includes the leading '.',
+// `fn` receives the text following the command name.
+struct Command
+{
+  const char* name;
+  function<void(const char*)> fn;
+} commands[] = {
+  {".automaton", [](const char*) {print_automaton(anno->fsa); }},
+  {".assoc", [](const char*) {print_assoc(*anno); }},
+  {".help",
+    [](const char*) {
+      // The double quotes in the ".integer" line are escaped: a bare ""
+      // would end the literal and start a new one, so the quotes would never
+      // be printed.
+      fputs("Commands available from the prompt:\n"
+             "  .automaton    dump automaton\n"
+             "  .assoc        dump associated AST Expr for each state\n"
+             "  .help         display this help\n"
+             "  .integer      input is a list of non-negative integers, macros(#define) or '' \"\" quoted strings\n"
+             "  .macro        display defined macros\n"
+             "  .string       input is a string\n"
+             "  .stmt <ident> change target DefineStmt to <ident>\n"
+             "  .quit         exit interactive mode\n"
+             , stdout);
+    }},
+  {".integer",
+    [](const char*) {
+      mode = ReplMode::integer; puts(".integer mode");
+    }},
+  {".macro",
+    [](const char*) {
+      // Macros of the main module first, then those of its unqualified imports.
+      for (auto& it: main_module->macro)
+        printf("%s\t%ld\n", it.first.c_str(), it.second->value);
+      for (auto* import: main_module->unqualified_import)
+        for (auto& it: import->macro)
+          printf("%s\t%ld\n", it.first.c_str(), it.second->value);
+    }},
+  {".stmt",
+    [](const char* arg) {
+      // resolve() yields NULL for an unknown name and (Stmt*)1 for an
+      // ambiguous one.
+      Stmt* r = resolve(*main_module, "", arg);
+      if (! r)
+        printf("'%s' undefined\n", arg);
+      else if (r == (Stmt*)1)
+        printf("ambiguous '%s'\n", arg);
+      else if (dynamic_cast<PreprocessDefineStmt*>(r))
+        printf("'%s' is a macro\n", arg);
+      else if (auto d = dynamic_cast<DefineStmt*>(r)) {
+        anno = &compiled[d];
+        printf("%s :: DefineStmt\n", d->lhs.c_str());
+      } else
+        assert(0);
+    }},
+  {".string",
+    [](const char*) {
+      mode = ReplMode::string; puts(".string mode");
+    }
+  },
+  {".quit",
+    [](const char*) {
+      puts("Leaving interactive mode");
+      quit = true;
+    }
+  },
+};
+
+#ifdef HAVE_READLINE
+// readline generator for dot-commands: called repeatedly, with state == 0 on
+// the first call, and returns one strdup'ed command name starting with `text`
+// per call, then NULL when there are no more.
+static char* command_completer(const char* text, int state)
+{
+  static long next = 0;
+  if (state == 0)
+    next = 0;
+  const size_t prefix_len = strlen(text);
+  while (next < LEN(commands)) {
+    const Command& candidate = commands[next++];
+    if (strncmp(candidate.name, text, prefix_len) == 0)
+      return strdup(candidate.name);
+  }
+  return NULL;
+}
+
+// readline generator for macro names: walks the main module's top-level
+// statements and returns, one per call, the strdup'ed identifier of each
+// PreprocessDefineStmt starting with `text`; NULL when the list is exhausted.
+static char* macro_completer(const char* text, int state)
+{
+  static Stmt* cursor;
+  if (state == 0)
+    cursor = main_module->toplevel;
+  const size_t prefix_len = strlen(text);
+  while (cursor) {
+    Stmt* current = cursor;
+    cursor = current->next;
+    auto macro = dynamic_cast<PreprocessDefineStmt*>(current);
+    if (! macro)
+      continue;
+    if (strncmp(macro->ident.c_str(), text, prefix_len) == 0)
+      return strdup(macro->ident.c_str());
+  }
+  return NULL;
+}
+
+// readline generator for definition names: walks the main module's top-level
+// statements and returns, one per call, the strdup'ed left-hand side of each
+// DefineStmt starting with `text`; NULL when the list is exhausted.
+static char* stmt_completer(const char* text, int state)
+{
+  static Stmt* cursor;
+  if (state == 0)
+    cursor = main_module->toplevel;
+  const size_t prefix_len = strlen(text);
+  while (cursor) {
+    Stmt* current = cursor;
+    cursor = current->next;
+    auto def = dynamic_cast<DefineStmt*>(current);
+    if (! def)
+      continue;
+    if (strncmp(def->lhs.c_str(), text, prefix_len) == 0)
+      return strdup(def->lhs.c_str());
+  }
+  return NULL;
+}
+
+// readline completion hook. Picks a generator from the cursor context:
+// command names at the start of the line, definition names after ".stmt ",
+// macro names elsewhere while in integer mode; otherwise no completion.
+// readline's default completion is always suppressed.
+static char** on_complete(const char* text, int start, int end)
+{
+  rl_attempted_completion_over = 1;
+  char* (*generator)(const char*, int) = NULL;
+  if (start == 0)
+    generator = command_completer;
+  else if (start >= 6 && strncmp(rl_line_buffer, ".stmt ", 6) == 0)
+    generator = stmt_completer;
+  else if (mode == ReplMode::integer)
+    generator = macro_completer;
+  if (generator == NULL)
+    return NULL;
+  return rl_completion_matches(text, generator);
+}
+#else
+// Minimal stand-in for GNU readline(): prints `prompt`, reads one line from
+// stdin and returns it in a malloc'ed buffer without the trailing newline.
+// Returns NULL on end of input or read error. The caller frees the result.
+char* readline(const char* prompt)
+{
+  char* r = NULL;
+  size_t s = 0;
+  ssize_t n;
+  fputs(prompt, stdout);
+  // The prompt has no newline; flush so it is visible before blocking on input.
+  fflush(stdout);
+  if ((n = getline(&r, &s, stdin)) > 0) {
+    // Strip the line terminator only when there is one: the last line of the
+    // input may end at EOF without a newline and must keep its final byte.
+    if (r[n-1] == '\n')
+      r[n-1] = '\0';
+  } else {
+    free(r);
+    r = NULL;
+  }
+  return r;
+}
+#endif
+
+// Executes one dot-command. `line` starts with '.'; the command name extends
+// over the following alphanumeric characters and may be abbreviated to any
+// unambiguous prefix. The rest of the line, with surrounding whitespace
+// removed, is passed to the command. `line` is modified in place (trailing
+// whitespace is cut off).
+static void run_command(char* line)
+{
+  // <ctype.h> classifiers require a value representable as unsigned char;
+  // plain char may be negative for non-ASCII (e.g. UTF-8) input.
+  size_t p = 1;
+  while (line[p] && isalnum((unsigned char)line[p]))
+    p++;
+  // NULL: no match, (Command*)1: more than one match.
+  Command* com = NULL;
+  REP(i, LEN(commands))
+    if (! strncmp(commands[i].name, line, p)) {
+      if (com) com = (Command*)1;
+      else com = &commands[i];
+    }
+  if (! com)
+    printf("Unknown command '%s'\n", line);
+  else if (com == (Command*)1)
+    printf("Ambiguous command '%s'\n", line);
+  else {
+    while (line[p] && isspace((unsigned char)line[p]))
+      p++;
+    size_t len = strlen(line);
+    while (len && isspace((unsigned char)line[len-1]))
+      line[--len] = '\0';
+    com->fn(line+p);
+  }
+}
+
+// Interactive loop. Selects `stmt` as the target automaton, switches to
+// integer mode, prints the help, then reads lines until end of input or
+// ".quit". Lines starting with '.' are commands; any other line is fed to the
+// automaton and the visited states are printed. If the run does not end in a
+// negative state, the source text of the definitions associated with the
+// final state is printed with markers.
+void repl(DefineStmt* stmt)
+{
+#ifdef HAVE_READLINE
+  rl_attempted_completion_function = on_complete;
+#endif
+  // run_command() modifies its argument, so commands are run from a buffer.
+  char buf[BUF_SIZE];
+  snprintf(buf, sizeof buf, ".stmt %s", stmt->lhs.c_str());
+  run_command(buf);
+  strcpy(buf, ".integer");
+  run_command(buf);
+  strcpy(buf, ".help");
+  run_command(buf);
+  // ".stmt" leaves `anno` unset when the name could not be resolved.
+  if (! anno) return;
+  char* line;
+  stringstream ss;
+  while ((line = readline("λ ")) != NULL) {
+#ifdef HAVE_READLINE
+    if (line[0])
+      add_history(line);
+#endif
+    if (line[0] == '.') {
+      run_command(line);
+      free(line);
+      if (quit) break;
+      continue;
+    }
+
+    // Current state; the loops below stop once it becomes negative.
+    long u = anno->fsa.start;
+
+    if (mode == ReplMode::string) {
+      // String mode: decode the line as UTF-8 and feed each code point.
+      i32 i = 0, len;
+      long c;
+      len = strlen(line);
+      if (anno->fsa.is_final(u)) yellow(1);
+      else normal_yellow(1);
+      printf("%ld ", u); sgr0();
+      while (i < len) {
+        U8_NEXT_OR_FFFD(line, i, len, c);
+        // Control characters are shown by number, others as the character.
+        if (iswcntrl(c)) printf("%ld ", c);
+        else printf("%lc ", wint_t(c));
+        u = anno->fsa.transit(u, c);
+        if (anno->fsa.is_final(u)) yellow();
+        else normal_yellow();
+        printf("%ld ", u); sgr0();
+        if (u < 0) break;
+      }
+    } else {
+      // Integer mode: tokenize the line with the grammar's lexer and collect
+      // the input symbols first; u is set to -1 on any invalid token.
+      vector<long> input;
+      int token;
+      yyscan_t lexer;
+      raw_yylex_init_extra(0, &lexer);
+      YY_BUFFER_STATE buf = raw_yy_scan_bytes(line, strlen(line), lexer);
+      YYSTYPE yylval;
+      YYLTYPE yylloc;
+      while (u >= 0 && (token = raw_yylex(&yylval, &yylloc, lexer)) != 0) {
+        switch (token) { // all tokens with a destructor should be listed
+        case IDENT: {
+          // An identifier must name a macro; its value becomes one symbol.
+          Stmt* stmt = resolve(*main_module, "", yylval.str->c_str());
+          if (! stmt) {
+            printf("'%s' undefined", yylval.str->c_str());
+            u = -1;
+          } else if (stmt == (Stmt*)1) {
+            printf("ambiguous '%s'", yylval.str->c_str());
+            u = -1;
+          } else if (auto d = dynamic_cast<PreprocessDefineStmt*>(stmt))
+            input.push_back(d->value);
+          else if (auto d = dynamic_cast<DefineStmt*>(stmt)) {
+            printf("'%s' is not a macro", yylval.str->c_str());
+            u = -1;
+          } else
+            assert(0);
+          delete yylval.str;
+          break;
+        }
+        case INTEGER:
+          input.push_back(yylval.integer);
+          break;
+        case STRING_LITERAL:
+          // A string contributes its bytes in bytes mode, otherwise its
+          // decoded code points.
+          if (opt_bytes)
+            for (unsigned char c: *yylval.str)
+              input.push_back(c);
+          else
+            for (i32 c, i = 0; i < yylval.str->size(); ) {
+              U8_NEXT_OR_FFFD(yylval.str->c_str(), i, yylval.str->size(), c);
+              input.push_back(c);
+            }
+          delete yylval.str;
+          break;
+        case BRACED_CODE:
+          delete yylval.str;
+          // fall through
+        default:
+          printf("invalid token at column %ld-%ld\n", yylloc.start+1, yylloc.end);
+          u = -1;
+          break;
+        }
+      }
+      raw_yy_delete_buffer(buf, lexer);
+      raw_yylex_destroy(lexer);
+
+      if (u >= 0) {
+        if (anno->fsa.is_final(u)) yellow(1);
+        else normal_yellow(1);
+        printf("%ld ", u); sgr0();
+        for (long c: input) {
+          printf("%ld ", c);
+          u = anno->fsa.transit(u, c);
+          if (anno->fsa.is_final(u)) yellow();
+          else normal_yellow();
+          printf("%ld ", u); sgr0();
+          if (u < 0) break;
+        }
+      }
+    }
+    free(line);
+    puts("");
+    if (u >= 0) {
+      // Per definition: offsets where an associated expression starts or
+      // ends (start_finals), and +1/-1 events delimiting the expressions the
+      // state is inside of (inners).
+      unordered_map<DefineStmt*, vector<long>> start_finals;
+      unordered_map<DefineStmt*, vector<pair<long, long>>> inners;
+      for (auto aa: anno->assoc[u]) {
+        if (has_start(aa.second))
+          start_finals[aa.first->stmt].push_back(aa.first->loc.start);
+        if (has_inner(aa.second)) {
+          inners[aa.first->stmt].emplace_back(aa.first->loc.start, 1);
+          inners[aa.first->stmt].emplace_back(aa.first->loc.end, -1);
+        }
+        if (has_final(aa.second))
+          start_finals[aa.first->stmt].push_back(aa.first->loc.end);
+      }
+      vector<DefineStmt*> stmts;
+      for (auto& it: start_finals)
+        stmts.push_back(it.first);
+      for (auto& it: inners)
+        stmts.push_back(it.first);
+      // sort DefineStmt by location
+      sort(ALL(stmts), [](const DefineStmt* x, const DefineStmt* y) {
+        if (x->module != y->module)
+          return x->module < y->module;
+        if (x->loc.start != y->loc.start)
+          return x->loc.start < y->loc.start;
+        return x->loc.end < y->loc.end;
+      });
+      stmts.erase(unique(ALL(stmts)), stmts.end());
+      // Print each definition's source text: a cyan ':' is inserted at every
+      // start/final offset and characters inside an "inner" range are yellow.
+      for (auto* stmt: stmts) {
+        auto& start_final = start_finals[stmt];
+        auto& inner = inners[stmt];
+        sort(ALL(start_final));
+        sort(ALL(inner));
+        auto it0 = start_final.begin();
+        auto it1 = inner.begin();
+        // Number of inner ranges currently open at offset i.
+        long nest = 0;
+        FOR(i, stmt->loc.start, stmt->loc.end) {
+          for (; it0 != start_final.end() && *it0 < i; ++it0);
+          if (it0 != start_final.end() && *it0 == i) {
+            cyan();
+            putchar(':');
+            sgr0();
+          }
+          for (; it1 != inner.end() && it1->first <= i; ++it1)
+            nest += it1->second;
+          if (nest)
+            yellow();
+          putchar(stmt->module->locfile.data[i]);
+          if (nest)
+            sgr0();
+        }
+      }
+    }
+  }
+}

+ 4 - 0
yanshi/src/repl.hh

@@ -0,0 +1,4 @@
+#pragma once
+#include "syntax.hh"
+
+// Runs the interactive prompt with `stmt` as the initial target definition.
+void repl(DefineStmt* stmt);

+ 10 - 0
yanshi/src/syntax.cc

@@ -0,0 +1,10 @@
+#include "syntax.hh"
+
+// Deletes every statement of the list starting at `stmt`, following the
+// `next` links. Passing a null pointer is a no-op.
+void stmt_free(Stmt* stmt)
+{
+  for (Stmt* current = stmt; current; ) {
+    // Read the successor before the node is destroyed.
+    auto following = current->next;
+    delete current;
+    current = following;
+  }
+}

+ 577 - 0
yanshi/src/syntax.hh

@@ -0,0 +1,577 @@
+#pragma once
+#include "common.hh"
+#include "location.hh"
+
+#include <cxxabi.h>
+#include <memory>
+#include <string.h>
+#include <string>
+#include <typeinfo>
+#include <vector>
+using std::move;
+using std::pair;
+using std::string;
+using std::vector;
+
+//// Visitor
+
+template<class T>
+struct Visitor;
+
+template<class T>
+struct VisitableBase { // interface for hierarchies rooted at T that accept a Visitor<T>
+  virtual void accept(Visitor<T>& visitor) = 0; // Visitable<Base, Derived> provides the override
+};
+
+// CRTP adapter: implements `accept` for `Derived` by downcasting `*this` and
+// handing it to the visitor, so the `visit(Derived&)` overload is selected.
+template<class Base, class Derived>
+struct Visitable : Base {
+  void accept(Visitor<Base>& visitor) override {
+    Derived& self = static_cast<Derived&>(*this);
+    visitor.visit(self);
+  }
+};
+
+struct Action;
+struct InlineAction;
+struct RefAction;
+// Visitor interface for the Action hierarchy: one overload for the base type
+// and one per concrete action type.
+template<>
+struct Visitor<Action> {
+  virtual void visit(Action&) = 0;
+  virtual void visit(InlineAction&) = 0;
+  virtual void visit(RefAction&) = 0;
+};
+
+struct Expr;
+struct BracketExpr;
+struct CallExpr;
+struct CollapseExpr;
+struct ComplementExpr;
+struct ConcatExpr;
+struct DifferenceExpr;
+struct DotExpr;
+struct EmbedExpr;
+struct EpsilonExpr;
+struct IntersectExpr;
+struct LiteralExpr;
+struct PlusExpr;
+struct RepeatExpr;
+struct QuestionExpr;
+struct StarExpr;
+struct UnionExpr;
+// Visitor interface for the Expr hierarchy: one overload for the base type
+// and one per concrete expression node type.
+template<>
+struct Visitor<Expr> {
+  virtual void visit(Expr& expr) = 0;
+  virtual void visit(BracketExpr& expr) = 0;
+  virtual void visit(CallExpr& expr) = 0;
+  virtual void visit(CollapseExpr& expr) = 0;
+  virtual void visit(ComplementExpr& expr) = 0;
+  virtual void visit(ConcatExpr& expr) = 0;
+  virtual void visit(DifferenceExpr& expr) = 0;
+  virtual void visit(DotExpr& expr) = 0;
+  virtual void visit(EmbedExpr& expr) = 0;
+  virtual void visit(EpsilonExpr& expr) = 0;
+  virtual void visit(IntersectExpr& expr) = 0;
+  virtual void visit(LiteralExpr& expr) = 0;
+  virtual void visit(PlusExpr& expr) = 0;
+  virtual void visit(RepeatExpr& expr) = 0;
+  virtual void visit(QuestionExpr& expr) = 0;
+  virtual void visit(StarExpr& expr) = 0;
+  virtual void visit(UnionExpr& expr) = 0;
+};
+
+struct Stmt;
+struct ActionStmt;
+struct CppStmt;
+struct DefineStmt;
+struct EmptyStmt;
+struct ImportStmt;
+struct PreprocessDefineStmt;
+// Visitor interface for the Stmt hierarchy: one overload for the base type
+// and one per concrete statement type.
+template<>
+struct Visitor<Stmt> {
+  virtual void visit(Stmt& stmt) = 0;
+  virtual void visit(ActionStmt& stmt) = 0;
+  virtual void visit(CppStmt& stmt) = 0;
+  virtual void visit(DefineStmt& stmt) = 0;
+  virtual void visit(EmptyStmt& stmt) = 0;
+  virtual void visit(ImportStmt& stmt) = 0;
+  virtual void visit(PreprocessDefineStmt& stmt) = 0;
+};
+
+//// Action
+
+// Common base of all action nodes.
+struct Action : VisitableBase<Action> {
+  // Virtual because actions are deleted through Action* (see Expr's destructor).
+  virtual ~Action() = default;
+  Location loc; // presumably the action's span in the source text -- confirm
+};
+
+// Action that carries its code text directly.
+struct InlineAction : Visitable<Action, InlineAction> {
+  string code;
+  // Moves out of the caller's string; `code` is left in a moved-from state.
+  InlineAction(string& code)
+    : code(std::move(code)) {}
+};
+
+struct Module;
+// Action that refers to a separately defined action by (optionally qualified)
+// name instead of carrying code itself.
+struct RefAction : Visitable<Action, RefAction> {
+  string qualified, ident;
+  // Starts out NULL (like CallExpr::define_stmt) so an unresolved reference is
+  // detectable instead of being an indeterminate pointer.
+  ActionStmt* define_stmt = NULL; // set by ModuleUse
+  // Moves out of both caller strings; they are left in a moved-from state.
+  RefAction(string& qualified, string& ident) : qualified(move(qualified)), ident(move(ident)) {}
+};
+
+//// Expr
+
+// Base class of every expression node.
+struct Expr : VisitableBase<Expr> {
+  Location loc;
+  long pre, post, depth; // set by Compiler
+  vector<Expr*> anc; // set by Compiler
+  // Attached actions, one list per kind. Each entry pairs an Action owned by
+  // this node with a number (StmtPrinter prints it ahead of the action).
+  vector<pair<Action*, long>> entering, finishing, leaving, transiting;
+  DefineStmt* stmt = NULL; // set by ModuleImportDef
+  // Deletes every attached action. Child expressions are deleted by the
+  // subclasses that hold them.
+  virtual ~Expr() {
+    for (auto a: entering)
+      delete a.first;
+    for (auto a: finishing)
+      delete a.first;
+    for (auto a: leaving)
+      delete a.first;
+    for (auto a: transiting)
+      delete a.first;
+  }
+  // Returns the demangled dynamic type name with its last four characters
+  // (the "Expr" suffix) removed, e.g. "Bracket" for a BracketExpr.
+  string name() const {
+    const char* mangled = typeid(*this).name();
+    int status = 0;
+    std::unique_ptr<char, void(*)(void*)> r{
+      abi::__cxa_demangle(mangled, NULL, NULL, &status),
+      free
+    };
+    // __cxa_demangle returns NULL on failure; constructing a std::string from
+    // NULL is undefined, so fall back to the mangled spelling.
+    std::string t = r ? r.get() : mangled;
+    if (t.size() >= 4)
+      t.resize(t.size()-4); // suffix 'Expr'
+    return t;
+  }
+  // True when no action of any kind is attached to this node.
+  bool no_action() const {
+    return entering.empty() && finishing.empty() && leaving.empty() && transiting.empty();
+  }
+};
+
+// Bracket expression: stores the interval set it was built from.
+struct BracketExpr : Visitable<Expr, BracketExpr> {
+  DisjointIntervals intervals;
+  // Takes ownership of the heap-allocated argument: its contents are moved
+  // into the member and the moved-from source object is then deleted.
+  BracketExpr(DisjointIntervals* intervals)
+    : intervals(move(*intervals))
+  {
+    delete intervals;
+  }
+};
+
+// Reference to a definition by name; StmtPrinter renders it as
+// "qualified::ident", or just "ident" when `qualified` is empty.
+struct CallExpr : Visitable<Expr, CallExpr> {
+  string qualified, ident;
+  DefineStmt* define_stmt = NULL; // set by ModuleUse
+  // Moves out of both caller strings.
+  CallExpr(string& qualified, string& ident)
+    : qualified(std::move(qualified)), ident(std::move(ident)) {}
+};
+
+// Reference to a definition by name, with the same fields as CallExpr but
+// visited through its own overload.
+struct CollapseExpr : Visitable<Expr, CollapseExpr> {
+  string qualified, ident;
+  DefineStmt* define_stmt = NULL; // set by ModuleUse
+  // Moves out of both caller strings.
+  CollapseExpr(string& qualified, string& ident)
+    : qualified(std::move(qualified)), ident(std::move(ident)) {}
+};
+
+// Unary node; owns its operand and deletes it on destruction.
+struct ComplementExpr : Visitable<Expr, ComplementExpr> {
+  Expr* inner;
+  ComplementExpr(Expr* inner) : inner(inner) {}
+  ~ComplementExpr() override { delete inner; }
+};
+
+// Binary node; owns both operands and deletes them on destruction.
+struct ConcatExpr : Visitable<Expr, ConcatExpr> {
+  Expr* lhs;
+  Expr* rhs;
+  ConcatExpr(Expr* lhs, Expr* rhs) : lhs(lhs), rhs(rhs) {}
+  ~ConcatExpr() override {
+    delete lhs;
+    delete rhs;
+  }
+};
+
+// Binary node; owns both operands and deletes them on destruction.
+struct DifferenceExpr : Visitable<Expr, DifferenceExpr> {
+  Expr* lhs;
+  Expr* rhs;
+  DifferenceExpr(Expr* lhs, Expr* rhs) : lhs(lhs), rhs(rhs) {}
+  ~DifferenceExpr() override {
+    delete lhs;
+    delete rhs;
+  }
+};
+
+struct DotExpr : Visitable<Expr, DotExpr> {}; // leaf node with no data beyond what Expr holds
+
+// Reference by (optionally qualified) name, resolved by ModuleUse to either a
+// definition (`define_stmt`) or a number (`macro_value`).
+// NOTE(review): which of the two is filled in for a given node is decided
+// outside this header -- confirm against ModuleUse.
+struct EmbedExpr : Visitable<Expr, EmbedExpr> {
+  string qualified, ident;
+  DefineStmt* define_stmt = NULL; // set by ModuleUse
+  // Zero-initialized so reading it before ModuleUse runs is not an
+  // indeterminate-value read.
+  long macro_value = 0; // set by ModuleUse
+  // Moves out of both caller strings.
+  EmbedExpr(string& qualified, string& ident) : qualified(move(qualified)), ident(move(ident)) {}
+};
+
+struct EpsilonExpr : Visitable<Expr, EpsilonExpr> {}; // leaf node with no data beyond what Expr holds
+
+// Binary node; owns both operands and deletes them on destruction.
+struct IntersectExpr : Visitable<Expr, IntersectExpr> {
+  Expr* lhs;
+  Expr* rhs;
+  IntersectExpr(Expr* lhs, Expr* rhs) : lhs(lhs), rhs(rhs) {}
+  ~IntersectExpr() override {
+    delete lhs;
+    delete rhs;
+  }
+};
+
+// Leaf node holding a literal string.
+struct LiteralExpr : Visitable<Expr, LiteralExpr> {
+  string literal;
+  // Moves out of the caller's string.
+  LiteralExpr(string& literal)
+    : literal(std::move(literal)) {}
+};
+
+// Unary node; owns its operand and deletes it on destruction.
+struct PlusExpr : Visitable<Expr, PlusExpr> {
+  Expr* inner;
+  PlusExpr(Expr* inner) : inner(inner) {}
+  ~PlusExpr() override { delete inner; }
+};
+
+// Unary node with two numeric bounds; owns its operand and deletes it on
+// destruction. StmtPrinter prints the bounds as "low,high".
+struct RepeatExpr : Visitable<Expr, RepeatExpr> {
+  Expr* inner;
+  long low;
+  long high;
+  RepeatExpr(Expr* inner, long low, long high)
+    : inner(inner), low(low), high(high) {}
+  ~RepeatExpr() override { delete inner; }
+};
+
+// Unary node; owns its operand and deletes it on destruction.
+struct QuestionExpr : Visitable<Expr, QuestionExpr> {
+  Expr* inner;
+  QuestionExpr(Expr* inner) : inner(inner) {}
+  ~QuestionExpr() override { delete inner; }
+};
+
+// Unary node; owns its operand and deletes it on destruction.
+struct StarExpr : Visitable<Expr, StarExpr> {
+  Expr* inner;
+  StarExpr(Expr* inner) : inner(inner) {}
+  ~StarExpr() override { delete inner; }
+};
+
+// Binary node; owns both operands and deletes them on destruction.
+struct UnionExpr : Visitable<Expr, UnionExpr> {
+  Expr* lhs;
+  Expr* rhs;
+  UnionExpr(Expr* lhs, Expr* rhs) : lhs(lhs), rhs(rhs) {}
+  ~UnionExpr() override {
+    delete lhs;
+    delete rhs;
+  }
+};
+
+//// Stmt
+
+// Base class of all statements. Statements carry `prev`/`next` links;
+// stmt_free() follows `next` to delete a whole chain.
+struct Stmt {
+  Location loc;
+  Stmt* prev = NULL;
+  Stmt* next = NULL;
+  virtual ~Stmt() = default;
+  // Dispatches to the visitor overload matching the dynamic statement type.
+  virtual void accept(Visitor<Stmt>& visitor) = 0;
+};
+
+struct EmptyStmt : Visitable<Stmt, EmptyStmt> {}; // statement with no data beyond what Stmt holds
+
+// Statement pairing an identifier with a block of code text.
+struct ActionStmt : Visitable<Stmt, ActionStmt> {
+  string ident, code;
+  // Moves out of both caller strings.
+  ActionStmt(string& ident, string& code)
+    : ident(std::move(ident)), code(std::move(code)) {}
+};
+
+// Statement holding a block of code text.
+struct CppStmt : Visitable<Stmt, CppStmt> {
+  string code;
+  // Moves out of the caller's string.
+  CppStmt(string& code)
+    : code(std::move(code)) {}
+};
+
+// Definition statement: binds the name `lhs` to the expression tree `rhs`.
+struct DefineStmt : Visitable<Stmt, DefineStmt> {
+  bool export_ = false, intact = false;
+  string export_params, lhs;
+  Expr* rhs; // owned; deleted by the destructor
+  // Starts out NULL so a statement not yet attached to a module is
+  // detectable instead of holding an indeterminate pointer.
+  Module* module = NULL; // used in topological sort
+  // Moves out of the caller's `lhs` string and takes ownership of `rhs`.
+  DefineStmt(string& lhs, Expr* rhs) : lhs(move(lhs)), rhs(rhs) {}
+  ~DefineStmt() override {
+    delete rhs;
+  }
+};
+
+// Import statement: a file name plus an optional qualifier, which StmtPrinter
+// shows as "as <qualified>" when non-empty.
+struct ImportStmt : Visitable<Stmt, ImportStmt> {
+  string filename, qualified;
+  // Moves out of both caller strings.
+  ImportStmt(string& filename, string& qualified)
+    : filename(std::move(filename)), qualified(std::move(qualified)) {}
+};
+
+// Statement associating an identifier with an integer value.
+struct PreprocessDefineStmt : Visitable<Stmt, PreprocessDefineStmt> {
+  string ident;
+  long value;
+  // Moves out of the caller's `ident` string.
+  PreprocessDefineStmt(string& ident, long value)
+    : ident(std::move(ident)), value(value) {}
+};
+
+void stmt_free(Stmt* stmt);
+
+//// Visitor implementations
+
+// Debug printer: writes statements, expressions and actions to stdout as an
+// indented tree. `depth` is the current nesting level; node headers are
+// indented by two spaces per level.
+struct StmtPrinter : Visitor<Action>, Visitor<Expr>, Visitor<Stmt> {
+  int depth = 0;
+
+  // Prints one action list under `label`, or nothing when the list is empty.
+  // Each entry is printed one level deeper: its number, then the action.
+  void print_actions(const char* label, const vector<pair<Action*, long>>& actions) {
+    if (actions.empty())
+      return;
+    printf("%*s%s\n", 2*depth, "", label);
+    depth++;
+    for (auto& a: actions) {
+      indent(stdout, depth);
+      printf("%ld\n", a.second);
+      a.first->accept(*this);
+    }
+    depth--;
+  }
+
+  void visit(Action& action) override {
+    action.accept(*this);
+  }
+  void visit(InlineAction& action) override {
+    printf("%*s%s\n", 2*depth, "", "InlineAction");
+    printf("%*s%s\n", 2*(depth+1), "", action.code.c_str());
+  }
+  void visit(RefAction& action) override {
+    printf("%*s%s\n", 2*depth, "", "RefAction");
+    printf("%*s%s\n", 2*(depth+1), "", action.ident.c_str());
+  }
+
+  // Prints the actions attached to `expr`, grouped by kind, then the node.
+  void visit(Expr& expr) override {
+    print_actions("@entering", expr.entering);
+    print_actions("@finishing", expr.finishing);
+    // Leaving actions get their own label so the dump does not show two
+    // "@entering" groups.
+    print_actions("@leaving", expr.leaving);
+    print_actions("@transiting", expr.transiting);
+    expr.accept(*this);
+  }
+  void visit(BracketExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "BracketExpr");
+    printf("%*s", 2*(depth+1), "");
+    for (auto& x: expr.intervals.to)
+      printf("(%ld,%ld) ", x.first, x.second);
+    puts("");
+  }
+  void visit(CallExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "CallExpr");
+    printf("%*s", 2*(depth+1), "");
+    if (expr.qualified.size())
+      printf("%s::%s\n", expr.qualified.c_str(), expr.ident.c_str());
+    else
+      printf("%s\n", expr.ident.c_str());
+  }
+  void visit(CollapseExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "CollapseExpr");
+    printf("%*s", 2*(depth+1), "");
+    if (expr.qualified.size())
+      printf("%s::%s\n", expr.qualified.c_str(), expr.ident.c_str());
+    else
+      printf("%s\n", expr.ident.c_str());
+  }
+  void visit(ComplementExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "ComplementExpr");
+    depth++;
+    visit(*expr.inner);
+    depth--;
+  }
+  void visit(ConcatExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "ConcatExpr");
+    depth++;
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+    depth--;
+  }
+  void visit(DifferenceExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "DifferenceExpr");
+    depth++;
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+    depth--;
+  }
+  void visit(DotExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "DotExpr");
+  }
+  void visit(EmbedExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "EmbedExpr");
+    printf("%*s", 2*(depth+1), "");
+    if (expr.qualified.size())
+      printf("%s::%s\n", expr.qualified.c_str(), expr.ident.c_str());
+    else
+      printf("%s\n", expr.ident.c_str());
+  }
+  void visit(EpsilonExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "EpsilonExpr");
+  }
+  void visit(IntersectExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "IntersectExpr");
+    depth++;
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+    depth--;
+  }
+  void visit(LiteralExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "LiteralExpr");
+    printf("%*s%s\n", 2*(depth+1), "", expr.literal.c_str());
+  }
+  void visit(PlusExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "PlusExpr");
+    depth++;
+    visit(*expr.inner);
+    depth--;
+  }
+  void visit(RepeatExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "RepeatExpr");
+    printf("%*s%ld,%ld\n", 2*(depth+1), "", expr.low, expr.high);
+    depth++;
+    visit(*expr.inner);
+    depth--;
+  }
+  void visit(QuestionExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "QuestionExpr");
+    depth++;
+    visit(*expr.inner);
+    depth--;
+  }
+  void visit(StarExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "StarExpr");
+    depth++;
+    visit(*expr.inner);
+    depth--;
+  }
+  void visit(UnionExpr& expr) override {
+    printf("%*s%s\n", 2*depth, "", "UnionExpr");
+    depth++;
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+    depth--;
+  }
+
+  void visit(Stmt& stmt) override {
+    stmt.accept(*this);
+  }
+  void visit(ActionStmt& stmt) override {
+    printf("%*s%s\n", 2*depth, "", "ActionStmt");
+    printf("%*s%s\n", 2*(depth+1), "", stmt.ident.c_str());
+    printf("%*s%s\n", 2*(depth+1), "", stmt.code.c_str());
+  }
+  void visit(CppStmt& stmt) override {
+    printf("%*s%s\n", 2*depth, "", "CppStmt");
+    printf("%*s%s\n", 2*(depth+1), "", stmt.code.c_str());
+  }
+  void visit(DefineStmt& stmt) override {
+    printf("%*s%s%s\n", 2*depth, "", "DefineStmt", stmt.export_ ? " export" : "");
+    depth++;
+    indent(stdout, depth);
+    if (stmt.export_params.size())
+      printf("(%s) ", stmt.export_params.c_str());
+    printf("%s\n", stmt.lhs.c_str());
+    visit(*stmt.rhs);
+    depth--;
+  }
+  void visit(EmptyStmt& stmt) override {
+    printf("%*s%s\n", 2*depth, "", "EmptyStmt");
+  }
+  void visit(ImportStmt& stmt) override {
+    printf("%*s%s\n", 2*depth, "", "ImportStmt");
+    printf("%*s%s\n", 2*(depth+1), "", stmt.filename.c_str());
+    if (stmt.qualified.size())
+      printf("%*sas %s\n", 2*(depth+1), "", stmt.qualified.c_str());
+  }
+  void visit(PreprocessDefineStmt& stmt) override {
+    printf("%*s%s\n", 2*depth, "", "PreprocessDefineStmt");
+    printf("%*s%s %ld\n", 2*(depth+1), "", stmt.ident.c_str(), stmt.value);
+  }
+};
+
+//// Visitor implementation
+
+// Stmt visitor whose handlers for the concrete statement types all do
+// nothing; subclasses override the ones they need. visit(Stmt&) only
+// dispatches on the dynamic type.
+struct PreorderStmtVisitor : Visitor<Stmt> {
+  void visit(Stmt& stmt) override {
+    stmt.accept(*this);
+  }
+  void visit(ActionStmt&) override {}
+  void visit(CppStmt&) override {}
+  void visit(DefineStmt&) override {}
+  void visit(EmptyStmt&) override {}
+  void visit(ImportStmt&) override {}
+  void visit(PreprocessDefineStmt&) override {}
+};
+
+// Traversal skeleton with overridable hooks. The base-type overloads
+// (visit(Action&), visit(Expr&), visit(Stmt&)) call the matching pre_* hook,
+// dispatch on the dynamic type, then call the post_* hook. The per-node
+// overloads recurse into child expressions through visit(Expr&), so every
+// child gets its hooks.
+//
+// NOTE(review): visit(DefineStmt&) calls accept() on `rhs` directly, so
+// pre_expr/post_expr are not invoked for the root expression of a definition.
+// Nothing here walks an expression's attached actions either. Confirm both
+// are intended.
+struct PrePostActionExprStmtVisitor : Visitor<Action>, Visitor<Expr>, Visitor<Stmt> {
+  // Hooks; all default to no-ops.
+  virtual void pre_action(Action&) {}
+  virtual void post_action(Action&) {}
+  virtual void pre_expr(Expr&) {}
+  virtual void post_expr(Expr&) {}
+  virtual void pre_stmt(Stmt&) {}
+  virtual void post_stmt(Stmt&) {}
+
+  // Actions
+  void visit(Action& action) override {
+    pre_action(action);
+    action.accept(*this);
+    post_action(action);
+  }
+  void visit(InlineAction&) override {}
+  void visit(RefAction&) override {}
+
+  // Expressions
+  void visit(Expr& expr) override {
+    pre_expr(expr);
+    expr.accept(*this);
+    post_expr(expr);
+  }
+  void visit(BracketExpr&) override {}
+  void visit(CallExpr&) override {}
+  void visit(CollapseExpr&) override {}
+  void visit(ComplementExpr& expr) override {
+    visit(*expr.inner);
+  }
+  void visit(ConcatExpr& expr) override {
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+  }
+  void visit(DifferenceExpr& expr) override {
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+  }
+  void visit(DotExpr&) override {}
+  void visit(EmbedExpr&) override {}
+  void visit(EpsilonExpr&) override {}
+  void visit(IntersectExpr& expr) override {
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+  }
+  void visit(LiteralExpr&) override {}
+  void visit(PlusExpr& expr) override {
+    visit(*expr.inner);
+  }
+  void visit(RepeatExpr& expr) override {
+    visit(*expr.inner);
+  }
+  void visit(QuestionExpr& expr) override {
+    visit(*expr.inner);
+  }
+  void visit(StarExpr& expr) override {
+    visit(*expr.inner);
+  }
+  void visit(UnionExpr& expr) override {
+    visit(*expr.lhs);
+    visit(*expr.rhs);
+  }
+
+  // Statements
+  void visit(Stmt& stmt) override {
+    pre_stmt(stmt);
+    stmt.accept(*this);
+    post_stmt(stmt);
+  }
+  void visit(ActionStmt&) override {}
+  void visit(CppStmt&) override {}
+  void visit(DefineStmt& stmt) override {
+    stmt.rhs->accept(*this);
+  }
+  void visit(EmptyStmt&) override {}
+  void visit(ImportStmt&) override {}
+  void visit(PreprocessDefineStmt&) override {}
+};

+ 41 - 0
yanshi/unittest/determinize_test.cc

@@ -0,0 +1,41 @@
+#include "fsa.hh"
+#include "unittest/unittest_helper.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <type_traits>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+using namespace std;
+
+// Built-in fixture in the format read_nfa() parses: a header "n m k", then k
+// final states, then m edges "u a v". Label -1 is what read_dfa() reports as
+// an epsilon edge.
+const char test[] =
+"4 7 2\n"
+"2 3  \n"
+"0 0 1\n"
+"0 -1 2\n"
+"1 1 1\n"
+"1 1 3\n"
+"2 -1 1\n"
+"2 0 3\n"
+"3 0 2\n"
+;
+
+// Run with no arguments: feeds the fixture through stdin, prints the
+// determinized automaton and exits 0 iff fsa.n() == 4.
+// Run with any argument: reads the automaton from the real stdin and only
+// prints the result.
+int main(int argc, char *argv[])
+{
+  if (argc == 1) {
+    // Stage the fixture in a temporary file and make that file stdin. Every
+    // step is checked so an I/O failure is reported instead of the test
+    // silently parsing whatever stdin happens to contain.
+    char filename[] = "/tmp/XXXXXX";
+    int fd = mkstemp(filename);
+    if (fd < 0)
+      err(EX_OSERR, "mkstemp");
+    if (write(fd, test, sizeof test-1) != (ssize_t)(sizeof test-1))
+      err(EX_IOERR, "write %s", filename);
+    close(fd);
+    if (!freopen(filename, "r", stdin))
+      err(EX_OSERR, "freopen %s", filename);
+    unlink(filename); // the open stream keeps the file alive
+  }
+
+  auto relate = [](const vector<long>&) {};
+  Fsa fsa = read_nfa().determinize(relate);
+  print_fsa(fsa);
+
+  if (argc == 1)
+    return fsa.n() == 4 ? 0 : 1;
+}

+ 44 - 0
yanshi/unittest/difference_test.cc

@@ -0,0 +1,44 @@
+#include "fsa.hh"
+#include "unittest/unittest_helper.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+using namespace std;
+
+// Built-in fixture: two automata back to back, each in the format read_nfa()
+// parses (header "n m k", then k final states, then m edges "u a v").
+const char test[] =
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 0 3\n"
+"2 1 3\n"
+
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 1 3\n"
+"2 1 3\n"
+;
+
+// Run with no arguments: feeds the fixture through stdin and prints the
+// difference of the two automata.
+// Run with any argument: reads both automata from the real stdin.
+int main(int argc, char *argv[])
+{
+  if (argc == 1) {
+    // Stage the fixture in a temporary file and make that file stdin. Every
+    // step is checked so an I/O failure is reported instead of the test
+    // silently parsing whatever stdin happens to contain.
+    char filename[] = "/tmp/XXXXXX";
+    int fd = mkstemp(filename);
+    if (fd < 0)
+      err(EX_OSERR, "mkstemp");
+    if (write(fd, test, sizeof test-1) != (ssize_t)(sizeof test-1))
+      err(EX_IOERR, "write %s", filename);
+    close(fd);
+    if (!freopen(filename, "r", stdin))
+      err(EX_OSERR, "freopen %s", filename);
+    unlink(filename); // the open stream keeps the file alive
+  }
+
+  auto relate = [](long u, long v) {};
+  Fsa a = read_dfa(), b = read_dfa(), fsa = a.difference(b, relate);
+  print_fsa(fsa);
+
+  // NOTE(review): the size check is disabled, so this test only verifies that
+  // difference() runs to completion.
+  if (argc == 1)
+    return 0; // fsa.n() == 4 ? 0 : 1;
+}

+ 44 - 0
yanshi/unittest/intersection_test.cc

@@ -0,0 +1,44 @@
+#include "fsa.hh"
+#include "unittest/unittest_helper.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+using namespace std;
+
+// Built-in fixture: two automata back to back, each in the format read_nfa()
+// parses (header "n m k", then k final states, then m edges "u a v").
+const char test[] =
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 0 3\n"
+"2 1 3\n"
+
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 1 3\n"
+"2 1 3\n"
+;
+
+// Run with no arguments: feeds the fixture through stdin and prints the
+// intersection of the two automata.
+// Run with any argument: reads both automata from the real stdin.
+int main(int argc, char *argv[])
+{
+  if (argc == 1) {
+    // Stage the fixture in a temporary file and make that file stdin. Every
+    // step is checked so an I/O failure is reported instead of the test
+    // silently parsing whatever stdin happens to contain.
+    char filename[] = "/tmp/XXXXXX";
+    int fd = mkstemp(filename);
+    if (fd < 0)
+      err(EX_OSERR, "mkstemp");
+    if (write(fd, test, sizeof test-1) != (ssize_t)(sizeof test-1))
+      err(EX_IOERR, "write %s", filename);
+    close(fd);
+    if (!freopen(filename, "r", stdin))
+      err(EX_OSERR, "freopen %s", filename);
+    unlink(filename); // the open stream keeps the file alive
+  }
+
+  auto relate = [](long u, long v) {};
+  Fsa a = read_dfa(), b = read_dfa(), fsa = a.intersect(b, relate);
+  print_fsa(fsa);
+
+  // NOTE(review): the size check is disabled, so this test only verifies that
+  // intersect() runs to completion.
+  if (argc == 1)
+    return 0; // fsa.n() == 4 ? 0 : 1;
+}

+ 39 - 0
yanshi/unittest/minimize_test.cc

@@ -0,0 +1,39 @@
+#include "fsa.hh"
+#include "unittest/unittest_helper.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <type_traits>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+using namespace std;
+
+// Built-in fixture in the format read_nfa() parses: a header "n m k", then k
+// final states, then m edges "u a v".
+const char test[] =
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 0 3\n"
+"2 0 3\n"
+;
+
+// Run with no arguments: feeds the fixture through stdin, prints the
+// minimized automaton and exits 0 iff fsa.n() == 3.
+// Run with any argument: reads the automaton from the real stdin and only
+// prints the result.
+int main(int argc, char *argv[])
+{
+  if (argc == 1) {
+    // Stage the fixture in a temporary file and make that file stdin. Every
+    // step is checked so an I/O failure is reported instead of the test
+    // silently parsing whatever stdin happens to contain.
+    char filename[] = "/tmp/XXXXXX";
+    int fd = mkstemp(filename);
+    if (fd < 0)
+      err(EX_OSERR, "mkstemp");
+    if (write(fd, test, sizeof test-1) != (ssize_t)(sizeof test-1))
+      err(EX_IOERR, "write %s", filename);
+    close(fd);
+    if (!freopen(filename, "r", stdin))
+      err(EX_OSERR, "freopen %s", filename);
+    unlink(filename); // the open stream keeps the file alive
+  }
+
+  auto relate = [](const vector<long>&) {};
+  Fsa fsa = read_dfa().minimize(relate);
+  print_fsa(fsa);
+
+  if (argc == 1)
+    return fsa.n() == 3 ? 0 : 1;
+}

+ 46 - 0
yanshi/unittest/union_test.cc

@@ -0,0 +1,46 @@
+#include "fsa.hh"
+#include "unittest/unittest_helper.hh"
+
+#include <algorithm>
+#include <iostream>
+#include <type_traits>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+using namespace std;
+
+// Built-in fixture: two automata back to back, each in the format read_nfa()
+// parses (header "n m k", then k final states, then m edges "u a v").
+const char test[] =
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 0 3\n"
+"2 1 3\n"
+
+"4 4 1\n"
+"3  \n"
+"0 0 1\n"
+"0 1 2\n"
+"1 1 3\n"
+"2 1 3\n"
+;
+
+// Run with no arguments: feeds the fixture through stdin, prints the union of
+// the two automata and exits 0 iff fsa.n() == 4.
+// Run with any argument: reads both automata from the real stdin and only
+// prints the result.
+int main(int argc, char *argv[])
+{
+  if (argc == 1) {
+    // Stage the fixture in a temporary file and make that file stdin. Every
+    // step is checked so an I/O failure is reported instead of the test
+    // silently parsing whatever stdin happens to contain.
+    char filename[] = "/tmp/XXXXXX";
+    int fd = mkstemp(filename);
+    if (fd < 0)
+      err(EX_OSERR, "mkstemp");
+    if (write(fd, test, sizeof test-1) != (ssize_t)(sizeof test-1))
+      err(EX_IOERR, "write %s", filename);
+    close(fd);
+    if (!freopen(filename, "r", stdin))
+      err(EX_OSERR, "freopen %s", filename);
+    unlink(filename); // the open stream keeps the file alive
+  }
+
+  auto relate = [](long u, long v) {};
+  Fsa a = read_dfa(), b = read_dfa(), fsa = a.union_(b, relate);
+  print_fsa(fsa);
+
+  if (argc == 1)
+    return fsa.n() == 4 ? 0 : 1;
+}

+ 68 - 0
yanshi/unittest/unittest_helper.hh

@@ -0,0 +1,68 @@
+#pragma once
+#include "common.hh"
+#include "fsa.hh"
+
+#include <algorithm>
+#include <sysexits.h>
+#include <err.h>
+#include <assert.h>
+#include <iostream>
+using namespace std;
+
+// Reads an automaton description from std::cin and returns it as an Fsa.
+//
+// Input format (whitespace separated integers):
+//   n m k          -- n is the size of the adjacency table, m the number of
+//                     edges, k the number of final states
+//   f_1 ... f_k    -- final states, each in [0, n)
+//   u a v  (m times) -- edge from u to v with label a, where u and v are in
+//                     [0, n) and a is in [-1, 256)
+//
+// The start state is set to 0. `finals` and each adjacency list are sorted
+// before returning. Malformed or truncated input terminates the process with
+// EX_DATAERR.
+static Fsa read_nfa()
+{
+  long n = 0, m = 0, k = 0, u = 0, a = 0, v = 0;
+  // Check each extraction explicitly: a failed read would otherwise leave the
+  // loop counters unset. fail() is tested rather than good() because eofbit
+  // is legitimately set when the last number ends the input.
+  if (!(cin >> n >> m >> k))
+    errx(EX_DATAERR, "cannot read header 'n m k'");
+  if (n < 0 || m < 0 || k < 0)
+    errx(EX_DATAERR, "n, m and k must be non-negative");
+  Fsa r;
+  r.start = 0;
+  r.adj.resize(n);
+  while (k--) {
+    if (!(cin >> u))
+      errx(EX_DATAERR, "cannot read final state");
+    if (u < 0 || u >= n)
+      errx(EX_DATAERR, "%ld: 0 <= final < n", u);
+    r.finals.push_back(u);
+  }
+  sort(ALL(r.finals));
+  while (m--) {
+    if (!(cin >> u >> a >> v))
+      errx(EX_DATAERR, "cannot read edge 'u a v'");
+    if (u < 0 || u >= n)
+      errx(EX_DATAERR, "%ld: 0 <= u < n", u);
+    if (a < -1 || a >= 256)
+      errx(EX_DATAERR, "%ld: -1 <= c < 256", a);
+    if (v < 0 || v >= n)
+      errx(EX_DATAERR, "%ld: 0 <= v < n", v);
+    r.adj[u].emplace_back(a, v);
+  }
+  REP(i, n)
+    sort(ALL(r.adj[i]));
+  return r;
+}
+
+// Reads an automaton with read_nfa() and checks that it is deterministic:
+// no state may have an edge with a negative (epsilon) label or two edges with
+// the same label. Violations terminate the process with EX_DATAERR.
+static Fsa read_dfa()
+{
+  Fsa r = read_nfa();
+  REP(i, r.n())
+    if (r.adj[i].size()) {
+      // read_nfa() sorts each adjacency list, so the smallest label is first
+      // and equal labels are adjacent.
+      if (r.adj[i][0].first < 0)
+        errx(EX_DATAERR, "epsilon edge found for %ld", i);
+      REP(j, r.adj[i].size()-1)
+        if (r.adj[i][j].first == r.adj[i][j+1].first)
+          errx(EX_DATAERR, "duplicate labels %ld found for %ld", r.adj[i][j].first, i);
+    }
+  // Test fail() rather than good(): eofbit is set when the last number ends
+  // the input, which is not an error. Unlike assert, this check also survives
+  // NDEBUG builds.
+  if (cin.fail())
+    errx(EX_DATAERR, "malformed or truncated automaton description");
+  return r;
+}
+
+// Writes `fsa` to stdout: a "finals:" line listing fsa.finals, then an
+// "edges:" header followed by one line per index i in [0, fsa.n()) showing
+// the (first,second) pairs stored in fsa.adj[i].
+static void print_fsa(const Fsa& fsa)
+{
+  fputs("finals:", stdout);
+  for (long f: fsa.finals)
+    printf(" %ld", f);
+  putchar('\n');
+  puts("edges:");
+  REP(state, fsa.n()) {
+    printf("%ld:", state);
+    for (const auto& edge: fsa.adj[state])
+      printf(" (%ld,%ld)", edge.first, edge.second);
+    putchar('\n');
+  }
+}