| | 146 | |
| | 147 | == Python でマルコフ連鎖 == |
| | 148 | * 参考:[http://yamashita.dyndns.org/blog/enhanced-markov-chain-by-python MeCabとPythonでマルコフ連鎖を書いてみる(改)] |
| | 149 | |
| | 150 | {{{ |
| | 151 | #!python |
| | 152 | #!/usr/bin/env python |
| | 153 | # -*- coding: utf-8 -*- |
| | 154 | |
| | 155 | import random |
| | 156 | import MeCab |
| | 157 | import feedparser |
| | 158 | |
def wakati(text):
    """Split *text* into a list of tokens using MeCab's "-Owakati" mode.

    A ``MeCab.Tagger("-Owakati")`` is created on each call and asked to
    parse *text*. Trailing spaces and newlines are stripped from the
    tagger's output, and the remainder is split on single spaces.

    Returns:
        The list of tokens, in the order they appear in the tagger output.
    """
    tagger = MeCab.Tagger("-Owakati")
    parsed = tagger.parse(text)
    # Strip the trailing " \n" first so the split does not yield a final
    # empty token.
    return parsed.rstrip(" \n").split(" ")
| | 164 | |
def create_markov_table(wordlist, level=2):
    """Build a Markov transition table from a sequence of words.

    A sliding window holding the previous *level* words is moved over
    *wordlist*. Once every slot of the window is non-empty, the window
    (as a tuple) becomes a key and the current word is appended to that
    key's list of followers. Followers are kept with repetition, so a
    word that follows a key more often appears more often in the list.

    Args:
        wordlist: Iterable of words (tokens).
        level: Number of preceding words that make up a key.

    Returns:
        A plain ``dict`` mapping ``tuple`` keys of preceding words to the
        list of words observed after them. A plain dict is returned on
        purpose so that looking up an unknown key raises ``KeyError``.
    """
    markov = {}
    # The window starts out filled with empty strings; those placeholder
    # slots keep it from being used as a key until it is fully populated.
    window = ("",) * level

    for word in wordlist:
        if all(window):
            markov.setdefault(window, []).append(word)
        # Drop the oldest word and append the current one.
        window = window[1:] + (word,)

    return markov
| | 182 | |
def gen_sentence(markov, level=2, max_words=None):
    """Generate text by randomly walking a Markov transition table.

    A starting key is picked at random from *markov*. At every step one
    follower of the current key is chosen at random and appended to the
    output, and the key is advanced by dropping its oldest word and
    appending the chosen one. The words of the starting key itself are
    not part of the output.

    Args:
        markov: Mapping of tuple keys to lists of follower words.
        level: Unused; the key length is taken from the table's keys.
            Kept so existing callers that pass it keep working.
        max_words: Upper bound on the number of generated words. When
            ``None``, the total number of followers stored in *markov*
            is used, so the bound depends only on the table and not on
            any module-level variable.

    Returns:
        The generated words concatenated without a separator. An empty
        string is returned when *markov* is empty.
    """
    if not markov:
        # random.choice() raises IndexError on an empty sequence.
        return ''

    if max_words is None:
        max_words = sum(len(followers) for followers in markov.values())

    # list(...) makes the keys indexable for random.choice on Python 3 too.
    state = random.choice(list(markov))
    sentence = []

    while len(sentence) < max_words:
        followers = markov.get(state)
        if not followers:
            # Dead end: this key was never followed by anything, and the
            # state can no longer change, so stop instead of spinning.
            break
        word = random.choice(followers)
        sentence.append(word)
        state = tuple(state)[1:] + (word,)

    return ''.join(sentence)
| | 198 | |
def cnet_news():
    """Fetch the CNET Japan RSS feed and return its summaries as text.

    For every entry in the parsed feed, only the first line of the
    entry's "summary" field is kept. The kept lines are joined with
    newlines and the result is encoded as UTF-8.

    Returns:
        The UTF-8 encoded text, one feed entry per line.
    """
    feed = feedparser.parse("http://feed.japan.cnet.com/rss/index.rdf")
    first_lines = []
    for entry in feed.entries:
        # Keep only the first line of each summary.
        first_lines.append(entry["summary"].split("\n")[0])
    return "\n".join(first_lines).encode("utf-8")
| | 203 | |
if __name__ == "__main__":
    # Script entry point: fetch the news text, tokenize it, build a table
    # keyed on the preceding `level` words and print one generated text.
    level = 4
    # NOTE: `wordlist` must stay a module-level name; gen_sentence reads
    # it as a global to bound its loop.
    wordlist = wakati(cnet_news())
    markov = create_markov_table(wordlist, level)
    sentence = gen_sentence(markov, level)
    # The generated text is a UTF-8 byte string; decode it for printing.
    print(unicode(sentence, "utf-8"))
| | 211 | }}} |