#!/usr/bin/python
# coding:utf-8
from collections import Counter
from math import log


def createDataset():
    """Build the small hard-coded sample dataset and its feature names.

    Returns:
        A ``(labels, dataSet)`` tuple. ``labels`` is the list of the two
        feature names. ``dataSet`` is a list of five rows, each holding two
        0/1 values followed by a ``'yes'``/``'no'`` string in the last
        position. Fresh lists are created on every call.
    """
    feature_names = ['no surfacing', 'flippers']
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return feature_names, samples


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy, in bits, of the labels in ``dataSet``.

    The label of a row is its last element; every other element is ignored.

    Args:
        dataSet: A sized collection of non-empty rows (anything supporting
            ``len()`` and iteration). The last item of each row must be
            hashable.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the label frequencies as a
        float. Returns 0.0 for an empty dataset or when every row carries
        the same label.
    """
    numEntries = len(dataSet)
    # Counter replaces the manual "check key, initialise, increment" loop.
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        # float() keeps true division under Python 2 as well.
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value`` and drop that column.

    Args:
        dataSet: Iterable of rows (sliceable sequences such as lists).
        axis: Non-negative index of the column to test and remove.
        value: Value the column must equal for a row to be kept.

    Returns:
        A new list of new lists: each matching row with column ``axis``
        removed and all other columns kept in their original order. The
        input rows are not modified.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Keep the columns on BOTH sides of ``axis``; slicing only from
            # ``axis + 1`` would silently discard every earlier column.
            reducedFeatVec = list(featVec[:axis])
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split gives the largest information gain.

    Every column except the last is treated as a feature; the last column is
    the label used by ``calcShannonEnt``. For each feature the dataset is
    partitioned by that feature's distinct values, and the size-weighted
    entropy of the partitions is subtracted from the entropy of the whole
    dataset to obtain the information gain.

    Args:
        dataSet: List of equally long rows whose last item is the label.
            Feature values must be hashable.

    Returns:
        The index of the feature with the highest strictly positive
        information gain, or -1 when the dataset is empty, has no feature
        columns, or no split reduces the entropy.
    """
    if not dataSet:
        return -1
    numFeature = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    # float() keeps true division under Python 2 as well.
    numEntries = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        uniqueVals = set(example[i] for example in dataSet)
        # Expected entropy after splitting on feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            weight = len(subDataSet) / numEntries
            newEntropy += weight * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        # Strict comparison: on ties the earliest feature wins.
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature


# Demo run executed at module load: build the sample data, then call the
# entropy and feature-selection routines on it. Their return values are
# not used here.
labels, dataSet = createDataset()
calcShannonEnt(dataSet)
chooseBestFeatureToSplit(dataSet)