- 本文来源及参考资料:
- http://hi.baidu.com/beanchx/item/b153390de0691827a1312d49
- http://www.2cto.com/kf/201302/191521.html
- http://dev.21tx.com/2009/09/26/11354.html
1. 子集构造(Subset Construction)
这是一个转换NFA到DFA的算法。我们知道NFA和DFA的区别最主要的就是一个状态和一个input symbol是否能够确定一个状态的问题,对于NFA,它将确定一个组状态,而DFA将确定一个状态,因此,我们有一个很好的办法就是把NFA的状态集对应每个DFA的状态,这就是subset construction的思想,不过这只是大概泛泛而论,我们需要更加明确的认识
1) NFA在任何一个input symbol下,映射的状态集(通过move函数,这个集合通常用T字母表示)应该被知道
2) 必须保证1)中状态集都对应了DFA中的一个状态
具体算法:
Input : 一个NFA N
Output : 接受相同语言的DFA D
Method : 为D构架一个transition table(转换表) Dtran,每个DFA的状态是一个NFA的状态集合(这里一定要注意前面说过的1)2)两点)。我们定义一些操作:
s 表示NFA的状态,T 表示NFA的状态集合,a表示一个input symbol
ε-transition(ε转换)就是说input symbol为ε时的transition(转换)
操作(operation) |
描述(description) |
ε-closure(s) |
从NFA的状态s出发,只通过ε-transition到达的NFA的状态集合 |
ε-closure(T) |
NFA的集合T中的状态p,只通过ε-transition到达的NFA的状态集合,再求这些集合的交集。用数学表达就是 {p|p 属于 ε-closure(t) , t属于T} |
move(T,a) |
NFA的集合,这个集合在input symbol为a,状态为T中任意状态情况下,通过一个转换得到的集合 |
注意一下,所有的操作都是针对NFA的状态或者状态集合,得到的时NFA的状态集合,或者说是DFA看为一个状态
Subset Construction
初始Dstates,它仅仅含有状态(D的状态)ε-closure(s0),并且状态未被标记,s0表示开始状态,注意,Dstates放的是D的状态
while ( Dstates 有未标记的状态 T ) { // T是D中的一个状态,也是N中一个状态集
标记 T;
for ( input symbol a ){ // 遍历所有的input symbol
U = ε-closure(move(T, a)); // move为NFA的move函数
if ( U 不在 Dstates 中 )
把U作为尚未标记的状态加入Dstates;
Dtran[T, a] = U
}
}
注意,状态s,ε-closure(s)一定包含s
我们先来熟悉上面的操作operation,再来看上面的算法
ε-closure(0) = {0, 1, 2, 4, 7} // 从0状态出发的,input symbol为ε的所有状态的集合
ε-closure(3) = {1, 2, 3, 4, 6, 7}
ε-closure(8) = {8}
ε-closure( {3, 8} ) = ε-closure(3) U ε-closure(8) = {1, 2, 3, 4, 6, 7, 8}
move(0,a) = 空
move(7,a) = {8}
move(8,b) = {9}
move( {0, 1, 2, 4, 7}, a) = move(0,a) U move(1,a) U move(2,a) U move(4,a) U move(7,a) = {3, 8}
现在可以回去理解一下算法了。
这里再说说求ε-closure(T)的算法:
把T的所有状态压入stack(栈);
ε-closure(T)的初始值为 T 中的所有元素 ; // 也就是一定包含他们本身
while( 栈非空 ) {
弹出栈顶元素 t ;
for( 每个属于 move(t, ε) 的状态 u ){
if( u 不在 ε-closure(T) 中 ){
u 加入 ε-closure(T);
把 u 入栈;
}
}
}
下面对上图如何使用Set Construction算法来构建DFA做一个详细的描述:
1. 初始化Dstates 把集合 ε-closure(s0) = {0, 1, 2, 4, 7}作为第一个状态,设此状态为 A
2. 现在转化,input symbol {a, b},因此,求:
ε-closure(move(A, a));
ε-closure(move(A, b));
这里会得到2个状态
ε-closure(move(A, a)) = {1, 2, 3, 4, 6, 7, 8},设其为 B
ε-closure(move(A, b)) = {1, 2, 4, 5, 6, 7}, 设其为C
B,C放入Dstates
改写 Dtrans
最终得到的 Dtrans 为:
A = {0, 1, 2, 4, 7}
B = {1, 2, 3, 4, 6, 7, 8}
C = {1, 2, 4, 5, 6, 7}
D = {1, 2, 4, 5, 6, 7, 9}
因此,NFA转化成为DFA:
=========================================
DFA的状态
struct DFA_State { setcontent; bool bFlag; #ifdef _DEBUG uint idx; #endif DFA_State(const set & x) : content(x), bFlag(false) { #ifdef _DEBUG idx = inc(); #endif } inline const bool operator==(const DFA_State& x)const { if (&x == this) return true; return content == x.content; } #ifdef _DEBUG inline uint inc() { static uint i = 0; return i++; } #endif };
struct DFA_Edge { struct { Char_Type char_value; String_Type string_value; }data; enum Edge_Type { TUnknown = 0, TNot = 1, TChar = 2, TString = 4 }; uchar edge_type; DFA_State* pFrom; DFA_State* pTo; DFA_Edge(const Char_Type& x, bool bNot, DFA_State* pFrom, DFA_State* pTo) : pFrom(pFrom), pTo(pTo) { data.char_value = x; edge_type = bNot ? (TChar | TNot) : TChar; } DFA_Edge(const String_Type& x, bool bNot, DFA_State* pFrom, DFA_State* pTo) : pFrom(pFrom), pTo(pTo) { data.string_value = x; edge_type = bNot ? (TString | TNot) : TString; } inline const bool isNot()const { return (edge_type & TNot) == TNot; } inline const bool isChar()const { return (edge_type & TChar) == TChar; } inline const bool isString()const { return (edge_type & TString) == TString; } const Edge_Type edgeType()const { if (isChar()) return TChar; else if (isString()) return TString; else return TUnknown; } const bool operator<(const DFA_Edge& x)const { return (ulong)pFrom + pTo < (ulong)x.pFrom + x.pTo; } const bool operator==(const DFA_Edge& x)const { return pFrom == x.pFrom && pTo == x.pTo; } };
DFA_State* pDFAStart; setpDFAEnds; set dfa_Edges;
void epsilonClosure(const set& k, EpsilonClosureInfo& info) { for (typename set ::const_iterator i = k.begin(), m = k.end(); i != m; ++i) { info.states.insert(*i); set tmp; tmp.insert(*i); while (!tmp.empty()) { EpsilonNFA_State* pState = *tmp.begin(); for (typename vector ::const_iterator j = epsilonNFA_Edges[pState].begin(), n = epsilonNFA_Edges[pState].end(); j != n; ++j) { if (j->isEpsilon()) { if (info.states.insert(j->pTo).second) tmp.insert(j->pTo); } else if (j->isChar()) info.chars.insert(pair (j->data.char_value, j->isNot())); else if (j->isString()) info.strings.insert(pair (j->data.string_value, j->isNot())); } tmp.erase(pState); } } }
struct EpsilonClosureInfo { setstates; set > chars; set > strings; EpsilonClosureInfo() {} EpsilonClosureInfo(const set & states, const set >& chars, const set >& strings) : states(states) , chars(chars) , strings(strings) {} EpsilonClosureInfo(const EpsilonClosureInfo& x) { states = x.states; chars = x.chars; strings = x.strings; } };
setmove(const DFA_State& t, const Char_Type& c, bool bNot) { set result; for (typename set ::const_iterator i = t.content.begin(), m = t.content.end(); i != m; ++i) { for (typename vector ::const_iterator j = epsilonNFA_Edges[*i].begin(), n = epsilonNFA_Edges[*i].end(); j != n; ++j) { if (j->isChar() && j->data.char_value == c && j->isNot() == bNot) result.insert(j->pTo); } } return result; } set move(const DFA_State& t, const String_Type& s, bool bNot) { set result; for (typename set ::const_iterator i = t.content.begin(), m = t.content.end(); i != m; ++i) { for (typename vector ::const_iterator j = epsilonNFA_Edges[*i].begin(), n = epsilonNFA_Edges[*i].end(); j != n; ++j) { if (j->isString() && j->data.string_value == s && j->isNot() == bNot) result.insert(j->pTo); } } return result; }
void buildDFA() { setstart; start.insert(pEpsilonStart); typedef pair c_type; map > c; queue c2; pDFAStart = DFA_State_Alloc::allocate(); EpsilonClosureInfo info; epsilonClosure(start, info); construct(pDFAStart, info.states); c_type ct(pDFAStart, info); c[info.states.size()].push_back(ct); c2.push(ct); if (isEndDFAStatus(pDFAStart)) pDFAEnds.insert(pDFAStart); context.dfa_States.insert(pDFAStart); while (!c2.empty()) { DFA_State* t = c2.front().first; set > chars = c2.front().second.chars; set > strings = c2.front().second.strings; t->bFlag = true; for (typename set >::const_iterator i = chars.begin(), m = chars.end(); i != m; ++i) { EpsilonClosureInfo info; epsilonClosure(move(*t, i->first, i->second), info); DFA_State* p = getDFAState(info.states, c); if (p) // 如果这个状态已存在 { dfa_Edges.insert(DFA_Edge(i->first, i->second, t, p)); } else { DFA_State* pState = DFA_State_Alloc::allocate(); construct(pState, info.states); context.dfa_States.insert(pState); if (isEndDFAStatus(pState)) pDFAEnds.insert(pState); c_type ct(pState, info); c[info.states.size()].push_back(ct); c2.push(ct); dfa_Edges.insert(DFA_Edge(i->first, i->second, t, pState)); } } for (typename set >::const_iterator i = strings.begin(), m = strings.end(); i != m; ++i) { EpsilonClosureInfo info; epsilonClosure(move(*t, i->first, i->second), info); DFA_State* p = getDFAState(info.states, c); if (p) // 如果这个状态已存在 { dfa_Edges.insert(DFA_Edge(i->first, i->second, t, p)); } else { DFA_State* pState = DFA_State_Alloc::allocate(); construct(pState, info.states); context.dfa_States.insert(pState); if (isEndDFAStatus(pState)) pDFAEnds.insert(pState); c_type ct(pState, info); c[info.states.size()].push_back(ct); c2.push(ct); dfa_Edges.insert(DFA_Edge(i->first, i->second, t, pState)); } } c2.pop(); } }
void printDFA() { printf("---------- DFA Start ----------n"); settmp; for (typename set ::const_iterator i = dfa_Edges.begin(), m = dfa_Edges.end(); i != m; ++i) { printf("%03d -> %03d", i->pFrom->idx, i->pTo->idx); switch (i->edgeType()) { case DFA_Edge::TChar: printf("(%c)", i->data.char_value); break; case DFA_Edge::TString: printf("(%s)", i->data.string_value.c_str()); break; default: break; } if (i->isNot()) printf("(not)"); printf("n"); tmp.insert(i->pFrom); tmp.insert(i->pTo); } printf("start: %03d -> ends: ", pDFAStart->idx); for (typename set ::const_iterator i = pDFAEnds.begin(), m = pDFAEnds.end(); i != m; ++i) { printf("%03d ", (*i)->idx); } printf("n"); #if DEBUG_LEVEL == 3 printf("-------------------------------n"); for (typename set ::const_iterator i = tmp.begin(), m = tmp.end(); i != m; ++i) { printf("State: %03dn", (*i)->idx); for (typename set ::const_iterator j = (*i)->content.begin(), n = (*i)->content.end(); j != n; ++j) { printf("%03d ", (*j)->idx); } printf("n"); } #endif printf("----------- DFA End -----------n"); }
Rule_Type::Context context; Rule_Type a('a', context), b('b', context), d('d', context); Rule_Type result = (a - d).opt() + (+b | !(a + b)); result.buildDFA(); #ifdef _DEBUG result.printEpsilonNFA(); result.printDFA(); #endif

