#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urlopen
from random import randint


def word_list_sum(word_list):
    """Return the total of all occurrence counts in *word_list*.

    Args:
        word_list: Mapping of word -> number of times it was observed.

    Returns:
        The sum of the mapping's values (0 for an empty mapping).
    """
    # Only the counts matter here, so iterate the values view directly.
    return sum(word_list.values())


def retrieve_random_word(word_list):
    """Pick one word at random, weighted by its occurrence count.

    A random integer is drawn from 1..total (inclusive), then the counts are
    subtracted in iteration order until the draw is used up; the word that
    exhausts it is returned. A word with count ``k`` is therefore chosen with
    probability ``k / total``.

    Args:
        word_list: Mapping of word -> integer occurrence count.

    Returns:
        One of the keys of *word_list*.

    Raises:
        ValueError: If *word_list* is empty or its counts total less than 1,
            since there is nothing to draw from.
    """
    total = sum(word_list.values())
    if total < 1:
        # randint(1, total) would fail here anyway, but with an opaque
        # "empty range" message; fail with a clear one instead.
        raise ValueError(
            "word_list must contain at least one word with a positive count"
        )

    remaining = randint(1, total)
    for word, count in word_list.items():
        remaining -= count
        if remaining <= 0:
            return word


def build_word_dict(text):
    """Build a first-order Markov transition table from *text*.

    The text is tokenized on whitespace after double quotes are removed and
    the punctuation marks ``, . ; :`` are split off as tokens of their own.
    For every pair of adjacent tokens, the count of "second follows first" is
    incremented.

    Args:
        text: The source text as a single string.

    Returns:
        A dict mapping each token to a dict of ``{next_token: count}``.
        A token that appears only as the very last token has no entry of its
        own. Text with fewer than two tokens yields an empty dict.
    """
    # Drop double quotes entirely.
    text = text.replace('"', "")

    # Surround each punctuation mark with spaces so it becomes a separate
    # token and is kept in the chain instead of being glued to a word.
    for symbol in (",", ".", ";", ":"):
        text = text.replace(symbol, " " + symbol + " ")

    # split() with no argument splits on any run of whitespace (spaces,
    # newlines, tabs, carriage returns) and never produces empty strings.
    words = text.split()

    word_dict = {}
    for previous, current in zip(words, words[1:]):
        # Create the follower table for this token on first sight.
        followers = word_dict.setdefault(previous, {})
        followers[current] = followers.get(current, 0) + 1

    return word_dict


# NOTE (from the original author): depending on your network, reaching this
# host may require a proxy/VPN.
# Use the response as a context manager so the connection is always closed.
with urlopen("http://pythonscraping.com/files/inaugurationSpeech.txt") as response:
    text = str(response.read(), 'utf-8')
word_dict = build_word_dict(text)

# Generate a Markov chain of up to 100 tokens, starting from "I".
length = 100
current_word = "I"
chain_words = []
for _ in range(length):
    chain_words.append(current_word)
    successors = word_dict.get(current_word)
    if not successors:
        # Dead end: this token never had a recorded successor (e.g. it only
        # occurs as the last token of the text), so stop the chain early
        # instead of failing with a KeyError.
        break
    current_word = retrieve_random_word(successors)

# Each token is followed by a single space, including the last one.
chain = "".join(word + " " for word in chain_words)

print(chain)