文章来自http://www.360doc.com/content/09/0219/10/98111_2586955.shtml
以及http://hi.baidu.com/tcet030840zxp/item/c923fe885ddd0329100ef3ec
最近得做一些跟自动机有关的东东,其中一部分可以简要地看作是由一套正则文法生成状态自动机的过程。
什么是正则表达式?首先我们看看什么是正则表达式。这个东东呢,一般用于描述一个字符串的集合——直接说就是一个正则表达式可能可以匹配一大堆字符串,而这一大堆字符串可以形成一个集合,它们的共同特征就是可以被这个正则表达式匹配。就像去死去死团,但是不同的是去死去死团的团员的共同特征是不被任何异性所匹配——但是这没关系,我们不妨取去死去死团的补集就行了。 Mar(y|k|cus) is son of bitch. 真是非常de省力,唯一的缺点是可能对方不知道你在说什么。
下面是第三个基本结构,被称为Kleene star (或者 Kleene 闭包)的。因为这个操作是Stephen Cole Kleene发明的。啊啊,其实正则表达式这个东西就是Kleene发明的。这个同学真是非常的牛B,因为他是更加牛B的 阿隆佐 – 丘奇 ( Alonzo Church )——历史上和阿兰 – 图灵 ( Alan Turing ) 并称的人物——的学生。有多牛B呢,这个Kleene还和他的老师,还有图灵,就递归论发表了论文,奠定了可计算理论的根基。啊哈哈哈,真是牛B啊。
当然咯,还有一些别的操作,比如我们知道的+,?,集合[]等等,但是这些操作都可以通过上面三个基本操作的组合来完成。 什么是NFA?要说NFA嘛,我得先说说DFA。所谓DFA,就是Deterministic Finite state Automata的简称。是状态机的一种。通俗的说,就是一大堆状态的集合,里面有一个初始状态和若干终止状态,每个状态可以有若干个输出边前往别的状态。但是要求每个状态的同一种输出边至多只有一个,所以自动机被称为是”Deterministic”的。 如何转换?刚才Shellex说了,正则表达式有三种基本结构。如果能先将这三种基本结构转换成对应的NFA,然后在用这三种基本结构去拼装成完整的NFA是不是很省力呢? PUSH a To automaton stack PUSH b To automaton stack UNION STAR PUSH c To automaton stack CONCAT PUSH d To automaton stack CONCAT POP result 里面的UNION,STAR 和 CONCAT就是三种基本结构的生成操作了。而这三种基本操作的实现是这样的: POP T from automaton stack CREATE state A, B ADD transition from A to B with 'ε' ADD transition from A to T.entry with 'ε' ADD transition from T.exit to A with 'ε' ADD transition from T.exit to B with 'ε' ADD state A, B to T PUSH T to automaton stack Concat: POP F, S from automaton stack ADD transition from F.exit to S.entry with 'ε' JOIN S to F SET F.exit = S.exit SET F.inputSet += S.inputSet PUSH F to automaton stack Union: POP F, S from automaton stack CREATE state A, B ADD transition from A to F.entry with 'ε' ADD transition from A to S.entry with 'ε' ADD transition from T.exit to B with 'ε' ADD transition from T.exit to B with 'ε' JOIN S to F ADD state A, B to T SET F.exit = S.exit SET F.inputSet += S.inputSet PUSH F to automaton stack |
/* Regular expression implementation. * Supports only ( | ) * + ?. No escapes. * Compiles to NFA and then simulates NFA * using Thompson's algorithm. * * See also http://swtch.com/~rsc/regexp/ and * Thompson, Ken. Regular Expression Search Algorithm, * Communications of the ACM 11(6) (June 1968), pp. 419-422. * * Copyright (c) 2007 Russ Cox. * Can be distributed under the MIT license, see bottom of file. */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> /* * Convert infix regexp re to postfix notation. * Insert . as explicit concatenation operator. * Cheesy parser, return static buffer. */ char* re2post(char *re) { int nalt, natom; static char buf[8000]; char *dst; struct { int nalt; int natom; } paren[100], *p; p = paren; dst = buf; nalt = 0; natom = 0; if(strlen(re) >= sizeof buf/2) return NULL; for(; *re; re++){ switch(*re){ case '(': if(natom > 1){ --natom; *dst++ = '.'; } if(p >= paren+100) return NULL; p->nalt = nalt; p->natom = natom; p++; nalt = 0; natom = 0; break; case '|': if(natom == 0) return NULL; while(--natom > 0) *dst++ = '.'; nalt++; break; case ')': if(p == paren) return NULL; if(natom == 0) return NULL; while(--natom > 0) *dst++ = '.'; for(; nalt > 0; nalt--) *dst++ = '|'; --p; nalt = p->nalt; natom = p->natom; natom++; break; case '*': case '+': case '?': if(natom == 0) return NULL; *dst++ = *re; break; default: if(natom > 1){ --natom; *dst++ = '.'; } *dst++ = *re; natom++; break; } } if(p != paren) return NULL; while(--natom > 0) *dst++ = '.'; for(; nalt > 0; nalt--) *dst++ = '|'; *dst = 0; return buf; } /* * Represents an NFA state plus zero or one or two arrows exiting. * if c == Match, no arrows out; matching state. * If c == Split, unlabeled arrows to out and out1 (if != NULL). * If c < 256, labeled arrow with character c to out. */ enum { Match = 256, Split = 257 }; typedef struct State State; struct State { int c; State *out; State *out1; int lastlist; }; State matchstate = { Match }; /* matching state */ int nstate; /* Allocate and initialize State */ State* state(int c, State *out, State *out1) { State *s; nstate++; s = malloc(sizeof *s); s->lastlist = 0; s->c = c; s->out = out; s->out1 = out1; return s; } /* * A partially built NFA without the matching state filled in. * Frag.start points at the start state. * Frag.out is a list of places that need to be set to the * next state for this fragment. */ typedef struct Frag Frag; typedef union Ptrlist Ptrlist; struct Frag { State *start; Ptrlist *out; }; /* Initialize Frag struct. */ Frag frag(State *start, Ptrlist *out) { Frag n = { start, out }; return n; } /* * Since the out pointers in the list are always * uninitialized, we use the pointers themselves * as storage for the Ptrlists. */ union Ptrlist { Ptrlist *next; State *s; }; /* Create singleton list containing just outp. */ Ptrlist* list1(State **outp) { Ptrlist *l; l = (Ptrlist*)outp; l->next = NULL; return l; } /* Patch the list of states at out to point to start. */ void patch(Ptrlist *l, State *s) { Ptrlist *next; for(; l; l=next){ next = l->next; l->s = s; } } /* Join the two lists l1 and l2, returning the combination. */ Ptrlist* append(Ptrlist *l1, Ptrlist *l2) { Ptrlist *oldl1; oldl1 = l1; while(l1->next) l1 = l1->next; l1->next = l2; return oldl1; } /* * Convert postfix regular expression to NFA. * Return start state. */ State* post2nfa(char *postfix) { char *p; Frag stack[1000], *stackp, e1, e2, e; State *s; // fprintf(stderr, "postfix: %sn", postfix); if(postfix == NULL) return NULL; #define push(s) *stackp++ = s #define pop() *--stackp stackp = stack; for(p=postfix; *p; p++){ switch(*p){ default: s = state(*p, NULL, NULL); push(frag(s, list1(&s->out))); break; case '.': /* catenate */ e2 = pop(); e1 = pop(); patch(e1.out, e2.start); push(frag(e1.start, e2.out)); break; case '|': /* alternate */ e2 = pop(); e1 = pop(); s = state(Split, e1.start, e2.start); push(frag(s, append(e1.out, e2.out))); break; case '?': /* zero or one */ e = pop(); s = state(Split, e.start, NULL); push(frag(s, append(e.out, list1(&s->out1)))); break; case '*': /* zero or more */ e = pop(); s = state(Split, e.start, NULL); patch(e.out, s); push(frag(s, list1(&s->out1))); break; case '+': /* one or more */ e = pop(); s = state(Split, e.start, NULL); patch(e.out, s); push(frag(e.start, list1(&s->out1))); break; } } e = pop(); if(stackp != stack) return NULL; patch(e.out, &matchstate); return e.start; #undef pop #undef push } typedef struct List List; struct List { State **s; int n; }; List l1, l2; static int listid; void addstate(List*, State*); void step(List*, int, List*); /* Compute initial state list */ List* startlist(State *start, List *l) { l->n = 0; listid++; addstate(l, start); return l; } /* Check whether state list contains a match. */ int ismatch(List *l) { int i; for(i=0; i<l->n; i++) if(l->s[i] == &matchstate) return 1; return 0; } /* Add s to l, following unlabeled arrows. */ void addstate(List *l, State *s) { if(s == NULL || s->lastlist == listid) return; s->lastlist = listid; if(s->c == Split){ /* follow unlabeled arrows */ addstate(l, s->out); addstate(l, s->out1); return; } l->s[l->n++] = s; } /* * Step the NFA from the states in clist * past the character c, * to create next NFA state set nlist. */ void step(List *clist, int c, List *nlist) { int i; State *s; listid++; nlist->n = 0; for(i=0; i<clist->n; i++){ s = clist->s[i]; if(s->c == c) addstate(nlist, s->out); } } /* Run NFA to determine whether it matches s. */ int match(State *start, char *s) { int i, c; List *clist, *nlist, *t; clist = startlist(start, &l1); nlist = &l2; for(; *s; s++){ c = *s & 0xFF; step(clist, c, nlist); t = clist; clist = nlist; nlist = t; /* swap clist, nlist */ } return ismatch(clist); } int main(int argc, char **argv) { int i; char *post; State *start; if(argc < 3){ fprintf(stderr, "usage: nfa regexp string...n"); return 1; } post = re2post(argv[1]); if(post == NULL){ fprintf(stderr, "bad regexp %sn", argv[1]); return 1; } start = post2nfa(post); if(start == NULL){ fprintf(stderr, "error in post2nfa %sn", post); return 1; } l1.s = malloc(nstate*sizeof l1.s[0]); l2.s = malloc(nstate*sizeof l2.s[0]); for(i=2; i<argc; i++) if(match(start, argv[i])) printf("%sn", argv[i]); return 0; } /* * Permission is hereby granted, free of charge, to any person * obtaining a copy of this software and associated * documentation files (the "Software"), to deal in the * Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, * sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall * be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR * PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS * OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */