因为毕业设计的原因,简单研究了一下数据挖掘。 AprioriTid是数据挖掘关联规则方向的一个算法,用于找出具有最小支持度的频繁项目集。同时代码中也实现了根据频繁项目集找出具有最小确信度的关联规则的基本算法。 使用Ruby语言实现。
# Counts how often each candidate itemset occurs across all transactions and
# keeps only the itemsets that reach the minimum support count.
#
# @param candidate_transaction [Hash] transaction id => Array of itemsets
#   recorded for that transaction
# @param minsupp_count [Numeric] minimum number of occurrences an itemset
#   needs in order to be kept
# @return [Hash] itemset => occurrence count for every itemset whose count is
#   at least +minsupp_count+; looking up any other itemset yields 0
def simple_count(candidate_transaction, minsupp_count)
  support_count_of_itemset = Hash.new(0)

  candidate_transaction.each_value do |transaction|
    transaction.each { |itemset| support_count_of_itemset[itemset] += 1 }
  end

  # delete_if prunes in place (and returns the hash) instead of deleting
  # entries from inside a plain #each over the same hash.
  support_count_of_itemset.delete_if do |_itemset, support_count|
    support_count < minsupp_count
  end
end
# Finds the frequent itemsets of a transaction database in AprioriTid fashion:
# after the first pass, support is counted from the per-transaction lists of
# candidate itemsets produced by the previous pass, not from the raw database.
#
# Side effect: every frequent itemset found, with its support count, is
# merged into the shared @flatten hash.
#
# @param database [Hash] transaction id => Array of items
# @param minsupp [Numeric] minimum support as a fraction of the number of
#   transactions
# @return [Array<Hash>] element k maps each frequent k-itemset (an Array) to
#   its support count; element 0 is always an empty Hash
def aprioritid(database, minsupp)
  minsupp_count = minsupp * database.length
  itemsets_with_support = [{}]

  # Pass 1: every item of a transaction is a candidate 1-itemset.
  candidate_transaction_set_past = {}
  database.each do |tid, transaction|
    candidate_transaction_set_past[tid] = transaction.map { |item| [item] }
  end
  itemsets_with_support << simple_count(candidate_transaction_set_past, minsupp_count)

  k = 1
  until itemsets_with_support[k].empty?
    candidate_itemsets = apriori_gen(itemsets_with_support[k].keys)
    break if candidate_itemsets.empty?

    candidate_transaction_set = {}
    candidate_transaction_set_past.each do |tid, itemsets_past|
      # The items seen in this transaction during the previous pass do not
      # depend on the candidate, so flatten once per transaction.
      items_past = itemsets_past.flatten
      candidate_transaction_set[tid] = candidate_itemsets.select do |candidate_itemset|
        candidate_itemset.all? { |candidate_item| items_past.include?(candidate_item) }
      end
    end

    itemsets_with_support << simple_count(candidate_transaction_set, minsupp_count)
    k += 1
    candidate_transaction_set_past = candidate_transaction_set
  end

  # Tolerate being called before the shared hash has been initialised.
  @flatten ||= {}
  itemsets_with_support.each do |itemsets_with_support_for_each_pass|
    @flatten.merge!(itemsets_with_support_for_each_pass)
  end
  itemsets_with_support
end
# Generates candidate (k+1)-itemsets from a list of k-itemsets.
#
# An itemset is extended with the last item of another itemset only when that
# item is strictly greater than its own last item. For k = 1 every such pair
# becomes a candidate. For k >= 2 the candidate is kept only if each k-subset
# obtained by dropping one of its first k items is itself in +itemsets+.
#
# @param itemsets [Array<Array>] itemsets of equal length whose items are
#   mutually comparable with <
# @return [Array<Array>] the candidate itemsets, in generation order
def apriori_gen(itemsets)
  candidate_itemsets = []

  itemsets.each do |itemset1|
    itemsets.each do |itemset2|
      next unless itemset1[-1] < itemset2[-1]

      if itemset1.length == 1
        candidate_itemsets << [itemset1[-1], itemset2[-1]]
      elsif itemset2.length == 2
        # No shared-prefix requirement here, so the same candidate can be
        # reached through several partners; keep only its first occurrence.
        candidate = itemset1 + [itemset2[-1]]
        if !candidate_itemsets.include?(candidate) &&
           apriori_subsets_present?(candidate, itemset1.length, itemsets)
          candidate_itemsets << candidate
        end
      elsif itemset1[0..-2] == itemset2[0..-2]
        candidate = itemset1 + [itemset2[-1]]
        # Array#delete_at returns the removed item, not the array, so the
        # subset check must look at what remains after the deletion.
        if apriori_subsets_present?(candidate, itemset1.length, itemsets)
          candidate_itemsets << candidate
        end
      end
    end
  end

  candidate_itemsets
end

# True when every subset of +candidate+ formed by dropping one of its first
# +prefix_length+ items is a member of +itemsets+.
def apriori_subsets_present?(candidate, prefix_length, itemsets)
  (0...prefix_length).all? do |index|
    subset = candidate.dup
    subset.delete_at(index)
    itemsets.include?(subset)
  end
end
# Derives association rules that reach a minimum confidence from the
# per-size frequent itemsets.
#
# Confidence of "antecedent => consequent" is computed as
# support(itemset) / support(antecedent). Antecedent supports are read from
# the shared @flatten hash, which must already hold the support count of
# every frequent itemset (aprioritid fills it as a side effect).
#
# @param itemsets_with_support [Array<Hash>] element k maps each frequent
#   k-itemset (Array) to its support count; elements 0 and 1 are not used
# @param minconf [Numeric] minimum confidence a rule must reach
# @return [Array<Array>] unique rules, each a pair [antecedent, consequent]
#   of item Arrays
def find_out_rules(itemsets_with_support, minconf)
  rules = []

  (2..(itemsets_with_support.length - 1)).each do |itemset_size|
    itemsets_with_support[itemset_size].each do |frequent_itemset, support|
      # Consequents belong to one itemset only; carrying them over to the
      # next itemset would produce consequents with items it does not contain.
      one_item_consequent = []

      frequent_itemset.each_index do |index|
        item = frequent_itemset[index]
        antecedent = frequent_itemset.dup
        antecedent.delete_at(index)

        # Both the rule and its consequent are kept only when confident.
        next unless support.to_f / @flatten[antecedent].to_f >= minconf

        one_item_consequent << [item]
        rules << [antecedent, [item]]
      end

      # Nothing to grow larger consequents from.
      next if one_item_consequent.empty?

      rules.concat(rules_gen(frequent_itemset, one_item_consequent, minconf))
    end
  end

  rules.uniq
end
# Recursively derives rules whose consequent is one item larger than the
# consequents passed in, for a single frequent itemset.
#
# Larger consequents come from apriori_gen(m_item_consequent). For each one,
# the antecedent is the itemset without the consequent's items, and the rule
# is kept when support(itemset) / support(antecedent) reaches +minconf+.
# Supports are read from the shared @flatten hash. Only the consequents that
# produced a rule are passed to the next recursion level.
#
# @param frequent_itemset [Array] the frequent itemset the rules are cut from
# @param m_item_consequent [Array<Array>] consequents of equal size m
# @param minconf [Numeric] minimum confidence a rule must reach
# @return [Array<Array>] rules as [antecedent, consequent] pairs
def rules_gen(frequent_itemset, m_item_consequent, minconf)
  # An empty list has no first element to take the consequent size from.
  return [] if m_item_consequent.empty?
  return [] unless frequent_itemset.length > m_item_consequent[0].length + 1

  rules = []

  # select builds the surviving list without deleting from the array that is
  # being iterated, so no consequent is skipped.
  m1_item_consequent = apriori_gen(m_item_consequent).select do |consequent|
    antecedent = frequent_itemset.reject { |item| consequent.include?(item) }
    confident = @flatten[frequent_itemset].to_f / @flatten[antecedent].to_f >= minconf
    rules << [antecedent, consequent] if confident
    confident
  end

  rules.concat(rules_gen(frequent_itemset, m1_item_consequent, minconf))
end
# Shared lookup of frequent itemset => support count. aprioritid merges its
# results into it and the rule-generation methods read supports from it.
@flatten = {}

require('postgres')

# Script: load the stock table, mine frequent itemsets, print the rules.

puts('reading database at ' + Time.now.inspect)
# NOTE(review): positional arguments presumably are host, port, options, tty,
# database, login, password - verify against the postgres gem in use.
db = PGconn.new('localhost', 5432, '', '', 'pgsql', 'pgsql', '')
begin
  result = db.exec("select id, name, gen, base, basegen from stock;")
ensure
  # Release the connection even when the query raises.
  db.close
end

puts('converting at ' + Time.now.inspect)
# Fetch the row array once instead of once per cell.
rows = result.result
database = {}
# Each row becomes one transaction, keyed by its position in the result.
# NOTE(review): a transaction mixes Integer, String and Float items, and
# apriori_gen orders items with `<`, which raises when a String meets a
# number - confirm the data never makes both kinds of item frequent.
rows.each_index do |index|
  row = rows[index]
  database[index] = [row[0].to_i, row[1].to_s, row[2].to_f, row[3].to_f, row[4].to_f]
end

support = 0.5
confidence = 0.2

puts('aprioritid at ' + Time.now.inspect)
frequent_itemsets = aprioritid(database, support)

puts('find rules at ' + Time.now.inspect)
rules = find_out_rules(frequent_itemsets, confidence)

# Each rule prints as "{antecedent items} => {consequent items}". Joining the
# items avoids writing backspace characters to erase a trailing ", ".
rules.each do |rule|
  puts("{#{rule[0].join(', ')}} => {#{rule[1].join(', ')}}")
end
|