recommend.py

# =============================================================================
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
# VERSION 1.3
# 
# The contents of this file are subject to the ANUOS License Version 1.2
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at:
# 
#   http://datamining.anu.edu.au/linkage.html
# 
# Software distributed under the License is distributed on an "AS IS"
# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
# the License for the specific language governing rights and limitations
# under the License.
# 
# The Original Software is: "test.py"
# 
# The Initial Developers of the Original Software are:
#   Peter Christen
# 
# Copyright (C) 2002 - 2011 the Australian National University and
# others. All Rights Reserved.
# 
# Contributors:
# 
# Alternatively, the contents of this file may be used under the terms
# of the GNU General Public License Version 2 or later (the "GPL"), in
# which case the provisions of the GPL are applicable instead of those
# above. The GPL is available at the following URL: http://www.gnu.org/
# If you wish to allow use of your version of this file only under the
# terms of the GPL, and not to allow others to use your version of this
# file under the terms of the ANUOS License, indicate your decision by
# deleting the provisions above and replace them with the notice and
# other provisions required by the GPL. If you do not delete the
# provisions above, a recipient may use your version of this file under
# the terms of any one of the ANUOS License or the GPL.
# =============================================================================

# =============================================================================
# Start of Febrl project module: "test.py"
#
# Generated using "guiFebrl.py" on Fri Nov  9 16:43:05 2012
# =============================================================================

# Import necessary modules (Python standard modules first, then Febrl modules)

import logging

import classification
import comparison
import dataset
import encode
import indexing
import measurements
import mymath
import output
import stringcmp

# -----------------------------------------------------------------------------
# Initialise a logger, set level to info or warning
#
# Fetch the root logger first, then apply the chosen verbosity to it.
# Use logging.WARNING instead of logging.INFO for less output.
my_logger = logging.getLogger()

log_level = logging.INFO
my_logger.setLevel(log_level)

# -----------------------------------------------------------------------------
# Febrl project type: Deduplicate
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------

# Input data set A: a comma-delimited CSV file with a header line, opened
# for reading. Records are identified by the "ID" field.
#
data_set_a = dataset.DataSetCSV(
    description="Data set generated by Febrl GUI",
    # Where and how the file is accessed
    file_name="/home/jclark/projects/dpla_appfest/test_sample.csv",
    access_mode="read",
    delimiter=",",
    header_line=True,
    # Record identifier and value clean-up
    rec_ident="ID",
    strip_fields=True,
    miss_val=[''],  # An empty string is treated as a missing value
    # (field name, column number) pairs
    field_list=[
        ("ID", 0),
        ("Title", 1),
        ("Creator", 2),
        ("Subject", 3),
        ("Publisher", 4),
        ("Description", 5),
        ("Type", 6),
    ])


# -----------------------------------------------------------------------------

# Define field comparison functions
#
def _winkler_comparator(description):
  """Return a Winkler field comparator labelled with *description*.

  Every comparator in this project uses the same weights, threshold and
  Winkler check flags; only the description differs between them.
  """
  return comparison.FieldComparatorWinkler(description=description,
                                           agree_weight=1.0,
                                           disagree_weight=0.0,
                                           missing_weight=0.0,
                                           threshold=0.0,
                                           check_sim=True,
                                           check_init=True,
                                           check_long=True)

fc_funct_1 = _winkler_comparator("Winkler-Subject-Subject")
fc_funct_2 = _winkler_comparator("Winkler-Creator-Creator")
fc_funct_3 = _winkler_comparator("Winkler-Title-Title")
fc_funct_4 = _winkler_comparator("Winkler-Description-Description")

# One entry per compared field: (comparator, field name, field name).
# NOTE(review): the two names are presumably the field in the first and in
# the second data set; both are the same here -- confirm against
# comparison.RecordComparator.
field_comp_list = [
    (fc_funct_1, "Subject", "Subject"),
    (fc_funct_2, "Creator", "Creator"),
    (fc_funct_3, "Title", "Title"),
    (fc_funct_4, "Description", "Description"),
]

# Data set A is compared with itself (deduplication project)
rec_comp = comparison.RecordComparator(data_set_a, data_set_a, field_comp_list)

# -----------------------------------------------------------------------------

# Define the index used for "blocking": a full index over data set A paired
# with itself, with an empty index definition. Weight vectors are written to
# the file named by weight_vec_file.
#
index = indexing.FullIndex(
    dataset1=data_set_a,
    dataset2=data_set_a,
    rec_comparator=rec_comp,
    index_def=[],
    index_sep_str="",
    skip_missing=True,
    progress_report=1,
    weight_vec_file="/home/jclark/projects/dpla_appfest/match_weights")

# Build the index, compact it, then run the record pair comparisons
# (the three steps must happen in this order)
#
index.build()
index.compact()
index.run()

# Read back the weight vector file that was passed to the index above
#
field_names_list, w_vec_dict = output.LoadWeightVectorFile(
    "/home/jclark/projects/dpla_appfest/match_weights")

# -----------------------------------------------------------------------------

# Define the weight vector (record pair) classifier: a Fellegi-Sunter
# classifier with a lower threshold of 2.85 and an upper threshold of 4
#
classifier = classification.FellegiSunter(lower_threshold=2.85,
                                          upper_threshold=4)

# The classifier works directly on the original weight vector dictionary
#
class_w_vec_dict = w_vec_dict

# Unsupervised training: the two example sets handed to train() are empty
#
classifier.train(class_w_vec_dict, set(), set())

# Classify all weight vectors into three sets
# NOTE(review): presumably matches, non-matches and possible matches, judging
# by the variable names -- confirm against classification.FellegiSunter
#
m_set, nm_set, pm_set = classifier.classify(class_w_vec_dict)

# -----------------------------------------------------------------------------

# Generate a histogram of the weight vectors and print it to standard output,
# one line per entry of the returned list.
# NOTE(review): the second argument (1.0) is presumably the histogram bin
# width -- confirm against output.GenerateHistogram.
#
histo_str_list = output.GenerateHistogram(class_w_vec_dict, 1.0)

# print(line) with a single argument behaves the same as the Python 2 print
# statement, and is also valid syntax under Python 3
for line in histo_str_list:
  print(line)

# =============================================================================
# End of Febrl project module: "test.py"
# =============================================================================