#coding:utf-8 class Apriori(): def __init__(self): pass ''' 关联分析的目标包括两项:发现频繁项集和发现关联规则 ''' ''' 频繁项集:{} 对于包含N种物品的数据集共有2^N-1种项集组合。 支持度(support) 一个项集的支持度被定义为数据集中包含该项集的记录所占的比例。 Apriori算法:如果某个项集是频繁的,那么它的所有子集也是频繁的。 如果一个项集是非频繁集,那么它的所有超集也是非频繁集。 ''' def _createC1(self, dataSet): C1 = [] for transaction in dataSet: for item in transaction: if [item] not in C1: C1.append([item]) C1.sort() return map(frozenset, C1) # use frozen set so we can use it as a key in a dict def _scanD(self, D, Ck, minSupport=0.5): ssCnt = {} for tid in D: for can in Ck: if can.issubset(tid): if can in ssCnt: ssCnt[can] += 1 else: ssCnt[can] = 1 # if can not in ssCnt: # ssCnt[can] = 0 # ssCnt[can] += 1 # print ssCnt numItems = len(D) retList = [] supportK = {} for key in ssCnt: support = ssCnt[key]/float(numItems) # 计算支持度 if support >= minSupport: retList.append(key) supportK[key] = support return retList, supportK def aprioriGen(self, Lk, k): # k>=2 retList = [] lenLk = len(Lk) for i in range(lenLk): for j in range(i+1, lenLk): L1 = list(Lk[i])[:k-2] L2 = list(Lk[j])[:k-2] L1.sort() L2.sort() if L1 == L2: # if first k-2 elements are equal. when k is 3, {0,1},{0,2},{1,2}→{0,1}U{0,2}→{0,1,2} retList.append(Lk[i] | Lk[j]) return retList def apriori(self, dataSet, minSupport=0.5): # minSupport 最小支持度 D = map(set, dataSet) # 转换为集合set C1 = self._createC1(dataSet) # 创建C1,转换为集合frozenset L1, supp1 = self._scanD(D, C1, minSupport) # 基于C1和minSupport创建L1 L = [] supportData = {} L.append(L1) supportData.update(supp1) k = 2 while len(L[k-2]) > 1: Ck = self.aprioriGen(L[k-2], k) # 创建Ck Lk, suppK = self._scanD(D, Ck, minSupport) # 基于Ck和minSupport创建Lk L.append(Lk) supportData.update(suppK) k += 1 return L, supportData ''' 关联规则:→ 可信度(confidence):也称置信度 可信度(尿布→葡萄酒) = 支持度({尿布,葡萄酒})/支持度({尿布}) 如果某条规则并不满足最小可信度要求,那么该规则的所有子集也不会满足最小可信度要求。 ''' def _calcConf(self, freqSet, H, supportData, brl, minConf=0.7): # H为出现在右部的规则列表,如{0},{1} prunedH = [] for conseq in H: conf = supportData[freqSet]/supportData[freqSet-conseq] # 计算可信度 if conf >= minConf: print freqSet-conseq, '-->', conseq, 'conf:', conf brl.append((freqSet-conseq, conseq, conf)) prunedH.append(conseq) return prunedH def _rulesFromConseq(self, freqSet, H, supportData, brl, minConf=0.7): # H为出现在右部的规则列表,如{0},{1} m = len(H[0]) if len(freqSet) > (m+1): Hmp1 = self.aprioriGen(H, m+1) # 合并规则 Hmp = self._calcConf(freqSet, Hmp1, supportData, brl, minConf) # Hmp为出现在右部的合并规则列表,如{0,1} if len(Hmp) > 1: # 如果规则列表长度大于1,则进一步合并 self._rulesFromConseq(freqSet, Hmp, supportData, brl, minConf) def generateRules(self, L, supportData, minConf=0.7): # minConf 最小可信度 bigRuleList = [] for i in range(1, len(L)): # 从包含两个或者更多元素的项集开始规则构建过程 for freqSet in L[i]: H1 = [frozenset([item]) for item in freqSet] # 构建只包含单个元素的列表,即出现在规则右部的规则列表,如{0},{1} if i > 1: self._rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf) # 生成候选规则 else: self._calcConf(freqSet, H1, supportData, bigRuleList, minConf) # 对规则进行评估 return bigRuleList def loadDataSet(): return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]] if __name__ == '__main__': dataSet = loadDataSet() ap = Apriori() L, suppData = ap.apriori(dataSet, minSupport=0.5) print L print suppData rules = ap.generateRules(L, suppData, minConf=0.6) print rules
转载请注明:宁哥的小站 » Apriori算法的实现