1. 問題描述
馬爾可夫鏈算法用于生成一段隨機(jī)的英文,其思想非常簡(jiǎn)單。首先讀入數(shù)據(jù),然后將讀入的數(shù)據(jù)分成前綴和后綴兩部分,通過前綴來隨機(jī)獲取后綴,籍此產(chǎn)生一段可讀的隨機(jī)英文。
為了說明方便,假設(shè)我們有如下一段話:
假設(shè)前綴的長(zhǎng)度為2,則我們處理輸入以后得到如下數(shù)據(jù),我們首先獲取一個(gè)前綴,然后在前綴的后綴列表中隨機(jī)選擇一個(gè)單詞,然后改變前綴,重復(fù)上述過程,這樣,我們產(chǎn)生的句子將是可讀的。
下面是處理過的數(shù)據(jù):
前綴 后綴
show your flowcharts tables
your flowcharts and will
flowcharts and conceal
flowcharts will be
your tables and and
will be mystified. obvious.
be mystified. show
be obvious. (end)
處理這個(gè)文本的馬爾可夫鏈算法將首先帶引show your,然后隨機(jī)取出flowcharts 或者table 兩個(gè)單詞,假設(shè)選擇的是flowcharts, 則新的前綴就是your flowcharts,同理,選擇table 時(shí),新的前綴就是your table,有了新的前綴your flowcharts 以后,再次隨即選擇它的后綴,這次是在and 和 will 中隨機(jī)選擇,重復(fù)上述過程,就能夠產(chǎn)生一段可讀的文本。具體描述如下:
設(shè)置 w1 和 w2 為文本的前兩個(gè)詞
輸出 w1 和 w2
循環(huán):
隨機(jī)地選出 w3,它是文本中 w1 w2 的后綴中的一個(gè)
打印 w3
把 w1 和 w2 分別換成 w2 和 w3
重復(fù)循環(huán)
2.awk 程序
馬爾可夫鏈算法并不難,我們會(huì)在后面看到,用c語(yǔ)言來解決這個(gè)問題會(huì)相當(dāng)麻煩,而用awk則只需要5分鐘就能搞定。這簡(jiǎn)直就是一個(gè)演示awk優(yōu)點(diǎn)的問題。
awk 中有關(guān)聯(lián)數(shù)組,正好可以用來表示前綴和后綴的關(guān)系。程序如下:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
# markov.awk: markov chain algorithm for 2-word prefixes BEGIN { MAXGEN = 10000; NONWORD = "\n" ; w1 = w2 = NONWORD } { for (i = 1; i <= NF; i++) { # read all words statetab[w1,w2,++nsuffix[w1,w2]] = $i w1 = w2 w2 = $i } } END { statetab[w1,w2,++nsuffix[w1,w2]] = NONWORD # add tail w1 = w2 = NONWORD for (i = 0; i < MAXGEN; i++) { # generate r = int(rand()*nsuffix[w1,w2]) + 1 # nsuffix >= 1 p = statetab[w1,w2,r] if (p == NONWORD) exit print p w1 = w2 # advance chain w2 = p } } |
3. C++ 程序
該問題的主要難點(diǎn)就在于通過前綴隨機(jī)的獲取后綴,在C++ 中,我們可以借助map 來實(shí)現(xiàn)前綴和后綴的對(duì)應(yīng)關(guān)系,以此得到較高的開發(fā)效率。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
|
/* Copyright (C) 1999 Lucent Technologies */ /* Excerpted from 'The Practice of Programming' */ /* by Brian W. Kernighan and Rob Pike */ #include <time.h> #include <iostream> #include <string> #include <deque> #include <map> #include <vector> using namespace std; const int NPREF = 2; const char NONWORD[] = "\n" ; // cannot appear as real line: we remove newlines const int MAXGEN = 10000; // maximum words generated typedef deque<string> Prefix; map<Prefix, vector<string> > statetab; // prefix -> suffixes void build(Prefix&, istream&); void generate( int nwords); void add(Prefix&, const string&); // markov main: markov-chain random text generation int main( void ) { int nwords = MAXGEN; Prefix prefix; // current input prefix srand ( time (NULL)); for ( int i = 0; i < NPREF; i++) add(prefix, NONWORD); build(prefix, cin); add(prefix, NONWORD); generate(nwords); return 0; } // build: read input words, build state table void build(Prefix& prefix, istream& in) { string buf; while (in >> buf) add(prefix, buf); } // add: add word to suffix deque, update prefix void add(Prefix& prefix, const string& s) { if (prefix.size() == NPREF) { statetab[prefix].push_back(s); prefix.pop_front(); } prefix.push_back(s); } // generate: produce output, one word per line void generate( int nwords) { Prefix prefix; int i; for (i = 0; i < NPREF; i++) add(prefix, NONWORD); for (i = 0; i < nwords; i++) { vector<string>& suf = statetab[prefix]; const string& w = suf[ rand () % suf.size()]; if (w == NONWORD) break ; cout << w << "\n" ; prefix.pop_front(); // advance prefix.push_back(w); } } |
4. c 程序
如果需要程序運(yùn)行得足夠快,那就只能用較低級(jí)的語(yǔ)言來實(shí)現(xiàn)了。當(dāng)我們用c 語(yǔ)言來實(shí)現(xiàn)時(shí),就不得不考慮各種各樣的問題了。首先,面臨的第一個(gè)問題就是,如何表示前綴和后綴的關(guān)系?
這里采用前綴的key,后綴為value 的方式存儲(chǔ)前綴與后綴的關(guān)系,我們知道,hash表的查找速度最快,所以,這里采用hash表也是情理之中的事,只是看你能不能想到,用前綴作key,基于上面的思路,再仔細(xì)一點(diǎn),就沒有什么大問題了。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
|
/* Copyright (C) 1999 Lucent Technologies */ /* Excerpted from 'The Practice of Programming' */ /* by Brian W. Kernighan and Rob Pike */ /* * Markov chain random text generator. */ #include <string.h> #include <stdlib.h> #include <stdio.h> #include <string.h> #include <time.h> #include "eprintf.h" enum { NPREF = 2, /* number of prefix words */ NHASH = 4093, /* size of state hash table array */ MAXGEN = 10000 /* maximum words generated */ }; typedef struct State State; typedef struct Suffix Suffix; struct State { /* prefix + suffix list */ char *pref[NPREF]; /* prefix words */ Suffix *suf; /* list of suffixes */ State *next; /* next in hash table */ }; struct Suffix { /* list of suffixes */ char *word; /* suffix */ Suffix *next; /* next in list of suffixes */ }; State *lookup( char *prefix[], int create); void build( char *prefix[], FILE *); void generate( int nwords); void add( char *prefix[], char *word); State *statetab[NHASH]; /* hash table of states */ char NONWORD[] = "\n" ; /* cannot appear as real word */ /* markov main: markov-chain random text generation */ int main( void ) { int i, nwords = MAXGEN; char *prefix[NPREF]; /* current input prefix */ int c; long seed; setprogname( "markov" ); seed = time (NULL); srand (seed); for (i = 0; i < NPREF; i++) /* set up initial prefix */ prefix[i] = NONWORD; build(prefix, stdin); add(prefix, NONWORD); generate(nwords); return 0; } const int MULTIPLIER = 31; /* for hash() */ /* hash: compute hash value for array of NPREF strings */ unsigned int hash( char *s[NPREF]) { unsigned int h; unsigned char *p; int i; h = 0; for (i = 0; i < NPREF; i++) for (p = (unsigned char *) s[i]; *p != '\0' ; p++) h = MULTIPLIER * h + *p; return h % NHASH; } /* lookup: search for prefix; create if requested. */ /* returns pointer if present or created; NULL if not. */ /* creation doesn't strdup so strings mustn't change later. */ State* lookup( char *prefix[NPREF], int create) { int i, h; State *sp; h = hash(prefix); for (sp = statetab[h]; sp != NULL; sp = sp->next) { for (i = 0; i < NPREF; i++) if ( strcmp (prefix[i], sp->pref[i]) != 0) break ; if (i == NPREF) /* found it */ return sp; } if (create) { sp = (State *) emalloc( sizeof (State)); for (i = 0; i < NPREF; i++) sp->pref[i] = prefix[i]; sp->suf = NULL; sp->next = statetab[h]; statetab[h] = sp; } return sp; } /* addsuffix: add to state. suffix must not change later */ void addsuffix(State *sp, char *suffix) { Suffix *suf; suf = (Suffix *) emalloc( sizeof (Suffix)); suf->word = suffix; suf->next = sp->suf; sp->suf = suf; } /* add: add word to suffix list, update prefix */ void add( char *prefix[NPREF], char *suffix) { State *sp; sp = lookup(prefix, 1); /* create if not found */ addsuffix(sp, suffix); /* move the words down the prefix */ memmove (prefix, prefix+1, (NPREF-1)* sizeof (prefix[0])); prefix[NPREF-1] = suffix; } /* build: read input, build prefix table */ void build( char *prefix[NPREF], FILE *f) { char buf[100], fmt[10]; /* create a format string; %s could overflow buf */ sprintf (fmt, "%%%ds" , sizeof (buf)-1); while ( fscanf (f, fmt, buf) != EOF) add(prefix, estrdup(buf)); } /* generate: produce output, one word per line */ void generate( int nwords) { State *sp; Suffix *suf; char *prefix[NPREF], *w; int i, nmatch; for (i = 0; i < NPREF; i++) /* reset initial prefix */ prefix[i] = NONWORD; for (i = 0; i < nwords; i++) { sp = lookup(prefix, 0); if (sp == NULL) eprintf( "internal error: lookup failed" ); nmatch = 0; for (suf = sp->suf; suf != NULL; suf = suf->next) if ( rand () % ++nmatch == 0) /* prob = 1/nmatch */ w = suf->word; if (nmatch == 0) eprintf( "internal error: no suffix %d %s" , i, prefix[0]); if ( strcmp (w, NONWORD) == 0) break ; printf ( "%s\n" , w); memmove (prefix, prefix+1, (NPREF-1)* sizeof (prefix[0])); prefix[NPREF-1] = w; } } |