数据挖掘课后作业

资源描述

数据挖掘课后作业5.4（实现项目）使用你熟悉的程序设计语言（如C+或Java），实现本章介绍的三种频繁项集挖掘算法：（1）Apriori AS94B，（2）FP增长HPY00和（3）ECLAT Zak00（使用垂直数据格式挖掘）。在各种类型的大型数据集上比较每种算法的性能。写一个报告，分析在哪些情况下（如数据大小、数据分布、最小支持阀度值设置和模式的稠密度），某种算法比其他算法好，并陈述理由。三种算法的比较1、对与项集较大，频繁项集较分散，是一个稀疏型的数据集，性能为AprioriFP-growthEclat 2、对与数据集的项集较小，数据非常稠密的数据集，性能为：FP-growthAprioriEclat各算法采用的数据表示模式及挖掘策略不同。采用优化措施后的 Apriori算法，对于非稠密数据己经具有较高的效率，其性能甚至优于FP-growth 算法;但由于其采用的是广度优先的挖掘策略，对稠密数据效率仍较差。而 Eclat 算法采用的纵向表示法，对数据集较小的稠密数据，效率相对较高;但对于数据集较大的稀疏数据，效率较低,FP一树浓缩了数据库的主要信息，分而治之的挖掘策略也使挖掘问题的复杂程度有所降低。答：（1）Apriori算法的实现：使用Java语言实现Apriori算法，AprioriAlgorithm类包含了频繁项集的挖掘过程和频繁关联规则的挖掘过程；ProperSubsetCombination辅助类用于计算一个频繁项集的真子集，采用组合原理，基于数值编码原理实现的组合求解集合的真子集。Apriori算法的核心实现类为AprioriAlgorithm，实现的Java代码如下所示：（一）核心类package org.shirdrn.datamining.association;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import java.util.Map;import java.util.Set;import java.util.TreeMap;/* 关联规则挖掘：Apriori算法* * 该算法基本上按照Apriori算法的基本思想来实现的。*/public class AprioriAlgorithm private MapInteger, Set txDatabase; / 事务数据库private Float minSup; / 最小支持度private Float minConf; / 最小置信度private Integer txDatabaseCount; / 事务数据库中的事务数private MapInteger, SetSet freqItemSet; / 频繁项集集合private MapSet, SetSet assiciationRules; / 频繁关联规则集合public AprioriAlgorithm( MapInteger, Set txDatabase, Float minSup, Float minConf) this.txDatabase = txDatabase; this.minSup = minSup; this.minConf = minConf; this.txDatabaseCount = this.txDatabase.size(); freqItemSet = new TreeMapInteger, SetSet(); assiciationRules = new HashMapSet, SetSet();/* 扫描事务数据库，计算频繁1-项集* return*/public MapSet, Float getFreq1ItemSet() MapSet, Float freq1ItemSetMap = new HashMapSet, Float(); MapSet, Integer candFreq1ItemSet = this.getCandFreq1ItemSet(); IteratorMap.EntrySet, Integer it = candFreq1ItemSet.entrySet().iterator(); while(it.hasNext() Map.EntrySet, Integer entry = it.next(); / 计算支持度 Float supported = new Float(entry.getValue().toString()/new Float(txDatabaseCount); if(supported=minSup) freq1ItemSetMap.put(entry.getKey(), supported); return freq1ItemSetMap;/* 计算候选频繁1-项集* return*/public MapSet, Integer getCandFreq1ItemSet() MapSet, Integer candFreq1ItemSetMap = new HashMapSet, Integer(); IteratorMap.EntryInteger, Set it = txDatabase.entrySet().iterator(); / 统计支持数，生成候选频繁1-项集 while(it.hasNext() Map.EntryInteger, Set entry = it.next(); Set itemSet = entry.getValue(); for(String item : itemSet) Set key = new HashSet(); key.add(item.trim(); if(!candFreq1ItemSetMap.containsKey(key) Integer value = 1; candFreq1ItemSetMap.put(key, value); else Integer value = 1+candFreq1ItemSetMap.get(key); candFreq1ItemSetMap.put(key, value); return candFreq1ItemSetMap;/* 根据频繁(k-1)-项集计算候选频繁k-项集* * param m 其中m=k-1* param freqMItemSet 频繁(k-1)-项集* return*/public SetSet aprioriGen(int m, SetSet freqMItemSet) SetSet candFreqKItemSet = new HashSetSet(); IteratorSet it = freqMItemSet.iterator(); Set originalItemSet = null; while(it.hasNext() originalItemSet = it.next(); IteratorSet itr = this.getIterator(originalItemSet, freqMItemSet); while(itr.hasNext() Set identicalSet = new HashSet(); / 两个项集相同元素的集合(集合的交运算) identicalSet.addAll(originalItemSet); Set set = itr.next(); identicalSet.retainAll(set); / identicalSet中剩下的元素是identicalSet与set集合中公有的元素 if(identicalSet.size() = m-1) / (k-1)-项集中k-2个相同 Set differentSet = new HashSet(); / 两个项集不同元素的集合(集合的差运算) differentSet.addAll(originalItemSet); differentSet.removeAll(set); / 因为有k-2个相同，则differentSet中一定剩下一个元素，即differentSet大小为1 differentSet.addAll(set); / 构造候选k-项集的一个元素(set大小为k-1,differentSet大小为k) candFreqKItemSet.add(differentSet); / 加入候选k-项集集合 return candFreqKItemSet;/* 根据一个频繁k-项集的元素(集合)，获取到频繁k-项集的从该元素开始的迭代器实例* param itemSet* param freqKItemSet 频繁k-项集* return*/private IteratorSet getIterator(Set itemSet, SetSet freqKItemSet) IteratorSet it = freqKItemSet.iterator(); while(it.hasNext() if(itemSet.equals(it.next() break; return it;/* 根据频繁(k-1)-项集，调用aprioriGen方法，计算频繁k-项集* * param k * param freqMItemSet 频繁(k-1)-项集* return*/public MapSet, Float getFreqKItemSet(int k, SetSet freqMItemSet) MapSet, Integer candFreqKItemSetMap = new HashMapSet, Integer(); / 调用aprioriGen方法，得到候选频繁k-项集 SetSet candFreqKItemSet = this.aprioriGen(k-1, freqMItemSet); / 扫描事务数据库 IteratorMap.EntryInteger, Set it = txDatabase.entrySet().iterator(); / 统计支持数 while(it.hasNext() Map.EntryInteger, Set entry = it.next(); IteratorSet kit = candFreqKItemSet.iterator(); while(kit.hasNext() Set kSet = kit.next(); Set set = new HashSet(); set.addAll(kSet); set.removeAll(entry.getValue(); / 候选频繁k-项集与事务数据库中元素做差元算 if(set.isEmpty() / 如果拷贝set为空，支持数加1 if(candFreqKItemSetMap.get(kSet) = null) Integer value = 1; candFreqKItemSetMap.put(kSet, value); else Integer value = 1+candFreqKItemSetMap.get(kSet); candFreqKItemSetMap.put(kSet, value); / 计算支持度，生成频繁k-项集，并返回 return support(candFreqKItemSetMap);/* 根据候选频繁k-项集，得到频繁k-项集* * param candFreqKItemSetMap 候选k项集(包含支持计数)*/public MapSet, Float support(MapSet, Integer candFreqKItemSetMap) MapSet, Float freqKItemSetMap = new HashMapSet, Float(); IteratorMap.EntrySet, Integer it = candFreqKItemSetMap.entrySet().iterator(); while(it.hasNext() Map.EntrySet, Integer entry = it.next(); / 计算支持度 Float supportRate = new Float(entry.getValue().toString()/new Float(txDatabaseCount); if(supportRateminSup) / 如果不满足最小支持度，删除 it.remove(); else freqKItemSetMap.put(entry.getKey(), supportRate); return freqKItemSetMap;/* 挖掘全部频繁项集*/public void mineFreqItemSet() / 计算频繁1-项集 SetSet freqKItemSet = this.getFreq1ItemSet().keySet(); freqItemSet.put(1, freqKItemSet); / 计算频繁k-项集(k1) int k = 2; while(true) MapSet, Float freqKItemSetMap = this.getFreqKItemSet(k, freqKItemSet); if(!freqKItemSetMap.isEmpty() this.freqItemSet.put(k, freqKItemSetMap.keySet(); freqKItemSet = freqKItemSetMap.keySet(); else break; k+; /* 挖掘频繁关联规则* 首先挖掘出全部的频繁项集，在此基础上挖掘频繁关联规则*/public void mineAssociationRules() freqItemSet.remove(1); / 删除频繁1-项集 IteratorMap.EntryInteger, SetSet it = freqItemSet.entrySet().iterator(); while(it.hasNext() Map.EntryInteger, SetSet entry = it.next(); for(Set itemSet : entry.getValue() / 对每个频繁项集进行关联规则的挖掘 mine(itemSet); /* 对从频繁项集集合freqItemSet中每迭代出一个频繁项集元素，执行一次关联规则的挖掘* param itemSet 频繁项集集合freqItemSet中的一个频繁项集元素*/public void mine(Set itemSet) int n = itemSet.size()/2; / 根据集合的对称性，只需要得到一半的真子集 for(int i=1; i=n; i+) / 得到频繁项集元素itemSet的作为条件的真子集集合 SetSet properSubset = ProperSubsetCombination.getProperSubset(i, itemSet); / 对条件的真子集集合中的每个条件项集，获取到对应的结论项集，从而进一步挖掘频繁关联规则 for(Set conditionSet : properSubset) Set conclusionSet = new HashSet(); conclusionSet.addAll(itemSet); conclusionSet.removeAll(conditionSet); / 删除条件中存在的频繁项 confide(conditionSet, conclusionSet); / 调用计算置信度的方法，并且挖掘出频繁关联规则 /* 对得到的一个条件项集和对应的结论项集，计算该关联规则的支持计数，从而根据置信度判断是否是频繁关联规则* param conditionSet 条件频繁项集* param conclusionSet 结论频繁项集*/public void confide(Set conditionSet, Set conclusionSet) / 扫描事务数据库 IteratorMap.EntryInteger, Set it = txDatabase.entrySet().iterator(); / 统计关联规则支持计数 int conditionToConclusionCnt = 0; / 关联规则(条件项集推出结论项集)计数 int conclusionToConditionCnt = 0; / 关联规则(结论项集推出条件项集)计数 int supCnt = 0; / 关联规则支持计数 while(it.hasNext() Map.EntryInteger, Set entry = it.next(); Set txSet = entry.getValue(); Set set1 = new HashSet(); Set set2 = new HashSet(); set1.addAll(conditionSet); set1.removeAll(txSet); / 集合差运算：set-txSet if(set1.isEmpty() / 如果set为空，说明事务数据库中包含条件频繁项conditionSet / 计数 conditionToConclusionCnt+; set2.addAll(conclusionSet); set2.removeAll(txSet); / 集合差运算：set-txSet if(set2.isEmpty() / 如果set为空，说明事务数据库中包含结论频繁项conclusionSet / 计数 conclusionToConditionCnt+; if(set1.isEmpty() & set2.isEmpty() supCnt+; / 计算置信度 Float conditionToConclusionConf = new Float(supCnt)/new Float(conditionToConclusionCnt); if(conditionToConclusionConf=minConf) if(assiciationRules.get(conditionSet) = null) / 如果不存在以该条件频繁项集为条件的关联规则 SetSet conclusionSetSet = new HashSetSet(); conclusionSetSet.add(conclusionSet); assiciationRules.put(conditionSet, conclusionSetSet); else assiciationRules.get(conditionSet).add(conclusionSet); Float conclusionToConditionConf = new Float(supCnt)/new Float(conclusionToConditionCnt); if(conclusionToConditionConf=minConf) if(assiciationRules.get(conclusionSet) = null) / 如果不存在以该结论频繁项集为条件的关联规则 SetSet conclusionSetSet = new HashSetSet(); conclusionSetSet.add(conditionSet); assiciationRules.put(conclusionSet, conclusionSetSet); else assiciationRules.get(conclusionSet).add(conditionSet); /* 经过挖掘得到的频繁项集Map* * return 挖掘得到的频繁项集集合*/public MapInteger, SetSet getFreqItemSet() return freqItemSet;/* 获取挖掘到的全部的频繁关联规则的集合* return 频繁关联规则集合*/public MapSet, SetSet getAssiciationRules() return assiciationRules;（二）辅助类ProperSubsetCombination类是一个辅助类，在挖掘频繁关联规则的过程中，用于生成一个频繁项集元素的非空真子集，实现代码如下：package org.shirdrn.datamining.association;import java.util.BitSet;import java.util.HashSet;import java.util.Set;/* 求频繁项集元素(集合)的非空真子集集合* 从一个集合（大小为n）中取出m(m属于2n/2的闭区间)个元素的组合实现类，获取非空真子集的集合*/public class ProperSubsetCombination private static String array;private static BitSet startBitSet; / 比特集合起始状态private static BitSet endBitSet; / 比特集合终止状态，用来控制循环private static SetSet properSubset; / 真子集集合/* 计算得到一个集合的非空真子集集合* * param n 真子集的大小* param itemSet 一个频繁项集元素* return 非空真子集集合*/public static SetSet getProperSubset(int n, Set itemSet) String array = new StringitemSet.size(); ProperSubsetCombination.array = itemSet.toArray(array); properSubset = new HashSetSet(); startBitSet = new BitSet(); endBitSet = new BitSet(); / 初始化startBitSet，左侧占满1 for (int i=0; i=array.length-n; i-) endBitSet.set(i, true); / 根据起始startBitSet，将一个组合加入到真子集集合中 get(startBitSet); while(!startBitSet.equals(endBitSet) int zeroCount = 0; / 统计遇到10后，左边0的个数 int oneCount = 0; / 统计遇到10后，左边1的个数 int pos = 0; / 记录当前遇到10的索引位置 / 遍历startBitSet来确定10出现的位置 for (int i=0; i1 & counter0) pos-; endIndex = pos; for (int i=0; i0) endIndex = pos; get(startBitSet); return properSubset;/* 根据一次移位操作得到的startBitSet，得到一个真子集* param bitSet*/private static void get(BitSet bitSet) Set set = new HashSet(); for(int i=0; iarray.length; i+) if(bitSet.get(i) set.add(arrayi); properSubset.add(set);（2）FP增长算法的实现：使用C+语言实现，实现的代码如下：（一）头文件#include using namespace std;#include #include #include #include #include #define debug(a) printf(a) struct Item string item_name;/项目 Item *node_link; /节点链 Item *parent_link; /父节点链 map child_link; /孩子节点 int support_count; /支持度; void print(vector vector ssvec) for(int ii=0;iissvec.size();ii+) vector ss=ssvecii; for(int jj=0;jjss.size();jj+) coutitem_name:support_count ; coutsupport_count!=b-support_count) return a-support_countb-support_count; return a-item_nameitem_name;#ifndef _PF_TREE_H#define_PF_TREE_Hclass PFTree map head_table; void insert(Item * root,Item *vec,int curP,int maxSize,int );public : Item * root; vector T; int min_support; void build_PFTree(vector vector );/建表 void init_head_table(vector vector );/建树 void build_PFTree(vector vector );/建表 void init_head_table(vector vector );/建树 vector vector generate_base_pattern(Item *);/生成基模式 bool is_signal_path(); /查看是否是单源树 void print_tree(Item *r); void delte_all(Item *root); void print_head_table(); ;/生成基模式/vector vector PFTree:generate_base_pattern(Item * header) Item *tmp=header-node_link; vector vector ret; while(tmp) / if(!tmp-parent_link)break; Item* tt; vector inn; inn.push_back(tmp); tt=tmp-parent_link; / debug; while(tt) if(!tt-parent_link)break; inn.push_back(tt); tt=tt-parent_link; ret.push_back(inn); tmp=tmp-node_link; / coutinendl;/ print(ret); return ret; /*void PFTree:output_change() freopen(data.out,w,stdout);*/void PFTree:print_head_table() Item *tmp; int i,s=T.size(); for(i=0;is;i+) tmp=Ti; while(tmp) coutitem_name+:support_countnode_link; if(i!=s-1)cout; coutendl;/ 建立表头 /void PFTree:init_head_table(vector vector ssvec) int i,j; vector svec; for(i=0;issvec.size();i+) svec=ssveci; for(j=0;jsupport_count=1; n-item_name=svecj ; n-node_link=NULL; head_tablesvecj=n; else head_tablesvecj-support_count+; map: iterator it; for(it=head_table.begin();it!=head_table.end();it+) if(Item *)it-second)-support_count=min_support) T.push_back(Item *)(*it).second); sort(T.begin(),T.end(),comp); / 建树/ void PFTree:build_PFTree(vector vector svec) root=new Item; Item * t; root-parent_link=NULL; vector vec; Item * tmp100; int tmp_size; for(int i=0;isvec.size();i+) vec=sveci; tmp_size=0; for(int j=0;jsupport_count=min_support) tmp+tmp_size=t; if(tmp_size=0)continue; sort(tmp+1,tmp+tmp_size+1,comp); insert(root,tmp,1,tmp_size,1); / /查看是否是单源树/bool PFTree:is_signal_path() Item * tmp=root; while(tmp) if(tmp-child_link.size()1)return false; if(tmp-child_link.size()=0)break; tmp=( tmp-child_link.begin() )-second; return true;/打印树/void PFTree:print_tree(Item * r) Item * tmp; map: iterator it; for(it=r-child_link.begin(); it!=r-child_link.end(); it+) tmp=(Item *)(it-second); cout;coutitem_name:support_countchild_link).size()0) cout; print_tree(tmp); coutchild_link.size()=0) delete r;return ; map: iterator it; for(it=r-child_link.begin(); it!=r-child_link.end(); it+) delte_all(it-second); delete r; void PFTree:init_head_table( vector vector ssvec) int i,j,sup; vector svec; if(ssvec.size()1) for(i=0;isupport_count; for(i=0;isupport_count; for(j=1;jitem_name) Item * n=new Item(); n-item_name=svecj-item_name; n-support_count=sup; n-node_link=NULL; head_tablesvecj-item_name=n; else head_tablesvecj-item_name-support_count+=sup; map: iterator it; for(it=head_table.begin();it!=head_table.end();it+) Item * tt=it-second; if(tt-support_count=min_support) T.push_back(tt);

展开阅读全文

数据挖掘课后作业

最新文档