openrefine-client/google/test/test_tutorial.py

468 lines
20 KiB
Python

#!/usr/bin/env python
"""
test_refine.py
The tests here are based on David Huynh's Refine tutorial at
http://davidhuynh.net/spaces/nicar2011/tutorial.pdf The tests perform all
the Refine actions given in the tutorial (except the web scraping) and verify
the changes expected to observe explained in the tutorial.
These tests require a connection to a Refine server either at
http://127.0.0.1:3333/ or by specifying environment variables REFINE_HOST
and REFINE_PORT.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import os
import unittest
from google.refine import refine
from google.refine import facet
PATH_TO_TEST_DATA = os.path.join('google', 'test', 'data')
class RefineTestCase(unittest.TestCase):
project_file = None
project_file_options = {}
project = None
# Section "2. Exploration using Facets": {1}, {2}
def setUp(self):
self.server = refine.RefineServer()
self.refine = refine.Refine(self.server)
if self.project_file:
self.project = self.refine.new_project(
os.path.join(PATH_TO_TEST_DATA, self.project_file),
**self.project_file_options)
def tearDown(self):
if self.project:
self.project.delete()
self.project = None
class RefineServerTest(RefineTestCase):
def test_init(self):
self.assertEqual(self.server.server,
'http://%s:%s' % (refine.REFINE_HOST, refine.REFINE_PORT))
server = refine.RefineServer('http://refine.example/')
self.assertEqual(server.server, 'http://refine.example')
def test_list_projects(self):
projects = self.refine.list_projects()
self.assertTrue(isinstance(projects, dict))
def test_get_version(self):
version_info = self.refine.get_version()
for item in ('revision', 'version', 'full_version', 'full_name'):
self.assertTrue(item in version_info)
class RefineTest(RefineTestCase):
project_file = 'duplicates.csv'
def test_new_project(self):
self.assertTrue(isinstance(self.project, refine.RefineProject))
def test_get_models(self):
self.assertEqual(self.project.key_column, 'email')
self.assertTrue('email' in self.project.column_order)
self.assertEqual(self.project.column_order['name'], 1)
def test_delete_project(self):
self.assertTrue(self.project.delete())
class TutorialTestFacets(RefineTestCase):
project_file = 'louisiana-elected-officials.csv'
def test_get_rows(self):
# Section "2. Exploration using Facets": {3}
response = self.project.get_rows(limit=10)
self.assertEqual(len(response.rows), 10)
self.assertEqual(response.limit, 10)
self.assertEqual(response.total, 6958)
for row in response.rows:
self.assertFalse(row.flagged)
self.assertFalse(row.starred)
def test_facet(self):
# Section "2. Exploration using Facets": {4}
party_code_facet = facet.TextFacet(column='Party Code')
response = self.project.compute_facets(party_code_facet)
pc = response.facets[0]
self.assertEqual(pc.name, 'Party Code')
self.assertEqual(pc.choices['D'].count, 3700)
self.assertEqual(pc.choices['N'].count, 15)
self.assertEqual(pc.blank_choice.count, 1446)
# {5}, {6}
engine = facet.Engine(party_code_facet)
ethnicity_facet = facet.TextFacet(column='Ethnicity')
engine.add_facet(ethnicity_facet)
self.project.engine = engine
response = self.project.compute_facets()
e = response.facets[1]
self.assertEqual(e.choices['B'].count, 1255)
self.assertEqual(e.choices['W'].count, 4469)
# {7}
ethnicity_facet.include('B')
response = self.project.get_rows()
self.assertEqual(response.filtered, 1255)
indexes = [r.index for r in response.rows]
self.assertEqual(indexes, [1, 2, 3, 4, 6, 12, 18, 26, 28, 32])
# {8}
response = self.project.compute_facets()
pc = response.facets[0]
self.assertEqual(pc.name, 'Party Code')
self.assertEqual(pc.choices['D'].count, 1179)
self.assertEqual(pc.choices['R'].count, 11)
self.assertEqual(pc.blank_choice.count, 46)
# {9}
party_code_facet.include('R')
response = self.project.compute_facets()
e = response.facets[1]
self.assertEqual(e.choices['B'].count, 11)
# {10}
party_code_facet.reset()
ethnicity_facet.reset()
response = self.project.get_rows()
self.assertEqual(response.filtered, 6958)
# {11}
office_title_facet = facet.TextFacet('Office Title')
self.project.engine.add_facet(office_title_facet)
response = self.project.compute_facets()
self.assertEqual(len(response.facets[2].choices), 76)
# {12} - XXX not sure how to interpret bins & baseBins yet
office_level_facet = facet.NumericFacet('Office Level')
self.project.engine.add_facet(office_level_facet)
# {13}
office_level_facet.From = 300 # from reserved word
office_level_facet.to = 320
response = self.project.get_rows()
self.assertEqual(response.filtered, 1907)
response = self.project.compute_facets()
ot = response.facets[2] # Office Title
self.assertEqual(len(ot.choices), 21)
self.assertEqual(ot.choices['Chief of Police'].count, 2)
self.assertEqual(ot.choices['Chief of Police '].count, 211)
# {14}
self.project.engine.remove_all()
response = self.project.get_rows()
self.assertEqual(response.filtered, 6958)
# {15}
phone_facet = facet.TextFacet('Phone', expression='value[0, 3]')
self.project.engine.add_facet(phone_facet)
response = self.project.compute_facets()
p = response.facets[0]
self.assertEqual(p.expression, 'value[0, 3]')
self.assertEqual(p.choices['318'].count, 2331)
# {16}
commissioned_date_facet = facet.NumericFacet('Commissioned Date',
expression='value.toDate().datePart("year")')
self.project.engine.add_facet(commissioned_date_facet)
response = self.project.compute_facets()
cd = response.facets[1]
self.assertEqual(cd.error_count, 959)
self.assertEqual(cd.numeric_count, 5999)
# {17}
office_description_facet = facet.NumericFacet('Office Description',
expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
self.project.engine.add_facet(office_description_facet)
response = self.project.compute_facets()
cd = response.facets[2]
self.assertEqual(cd.min, 0)
self.assertEqual(cd.max, 110)
self.assertEqual(cd.numeric_count, 548)
class TutorialTestEditing(RefineTestCase):
project_file = 'louisiana-elected-officials.csv'
def test_editing(self):
# Section "3. Cell Editing": {1}
self.project.engine.remove_all() # redundant due to setUp
# {2}
response = self.project.text_transform(column='Zip Code 2',
expression='value.toString()[0, 5]')
self.assertTrue('6067' in response['historyEntry']['description'])
# {3} - XXX history
# {4}
office_title_facet = facet.TextFacet('Office Title')
self.project.engine.add_facet(office_title_facet)
response = self.project.compute_facets()
self.assertEqual(len(response.facets[0].choices), 76)
response = self.project.text_transform('Office Title', 'value.trim()')
self.assertTrue('6895' in response['historyEntry']['description'])
response = self.project.compute_facets()
self.assertEqual(len(response.facets[0].choices), 67)
# {5}
response = self.project.edit('Office Title',
'Councilmen', 'Councilman')
self.assertTrue('13' in response['historyEntry']['description'])
response = self.project.compute_facets()
self.assertEqual(len(response.facets[0].choices), 66)
# {6}
response = self.project.compute_clusters('Office Title')
self.assertTrue(not response)
# {7}
clusters = self.project.compute_clusters('Office Title', 'knn')
self.assertEqual(len(clusters), 7)
self.assertEqual(len(clusters[0]), 2)
self.assertEqual(clusters[0][0]['value'], 'RSCC Member')
self.assertEqual(clusters[0][0]['count'], 233)
# Not strictly necessary to repeat 'Council Member' but a test
# of mass_edit, and it's also what the front end sends.
response = self.project.mass_edit('Office Title', [{
'from': ['Council Member', 'Councilmember'],
'to': 'Council Member'
}])
self.assertTrue('372' in response['historyEntry']['description'])
response = self.project.compute_facets()
self.assertEqual(len(response.facets[0].choices), 65)
# Section "4. Row and Column Editing, Batched Row Deletion"
# Test doesn't strictly follow the tutorial as the "Browse this
# cluster" performs a text facet which the server can't complete
# as it busts its max facet count. The useful work is done with
# get_rows(). Also, we can facet & select in one; the UI can't.
# {1}, {2}, {3}, {4}
clusters = self.project.compute_clusters('Candidate Name')
for cluster in clusters[0:3]: # just do a few
for match in cluster:
# {2}
if match['value'].endswith(', '):
response = self.project.get_rows(
facet.TextFacet('Candidate Name', match['value']))
self.assertEqual(len(response.rows), 1)
for row in response.rows:
response = self.project.star_row(row)
self.assertTrue(str(row.index + 1) in
response['historyEntry']['description'])
# {5}, {6}, {7}
response = self.project.compute_facets(facet.StarredFacet(True))
self.assertEqual(len(response.facets[0].choices), 2) # true & false
self.assertEqual(response.facets[0].choices[True].count, 3)
response = self.project.remove_rows()
self.assertTrue('3 rows' in response['historyEntry']['description'])
class TutorialTestDuplicateDetection(RefineTestCase):
project_file = 'duplicates.csv'
def test_duplicate_detection(self):
# Section "4. Row and Column Editing,
# Duplicate Row Detection and Deletion"
# {7}, {8}
response = self.project.get_rows(sort_by='email')
indexes = [r.index for r in response.rows]
self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7])
# {9}
response = self.project.reorder_rows()
self.assertEqual('Reorder rows',
response['historyEntry']['description'])
response = self.project.get_rows()
indexes = [r.index for r in response.rows]
self.assertEqual(indexes, range(10))
# {10}
response = self.project.add_column('email', 'count',
'facetCount(value, "value", "email")')
self.assertTrue('column email by filling 10 rows' in
response['historyEntry']['description'])
response = self.project.get_rows()
self.assertEqual(self.project.column_order['email'], 0) # i.e. 1st
self.assertEqual(self.project.column_order['count'], 1) # i.e. 2nd
counts = [r['count'] for r in response.rows]
self.assertEqual(counts, [2, 2, 1, 1, 3, 3, 3, 1, 2, 2])
# {11}
self.assertFalse(self.project.has_records)
response = self.project.blank_down('email')
self.assertTrue('Blank down 4 cells' in
response['historyEntry']['description'])
self.assertTrue(self.project.has_records)
response = self.project.get_rows()
emails = [1 if r['email'] else 0 for r in response.rows]
self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0])
# {12}
blank_facet = facet.BlankFacet('email', selection=True)
# {13}
response = self.project.remove_rows(blank_facet)
self.assertTrue('Remove 4 rows' in
response['historyEntry']['description'])
self.project.engine.remove_all()
response = self.project.get_rows()
email_counts = [(row['email'], row['count']) for row in response.rows]
self.assertEqual(email_counts, [
(u'arthur.duff@example4.com', 2),
(u'ben.morisson@example6.org', 1),
(u'ben.tyler@example3.org', 1),
(u'danny.baron@example1.com', 3),
(u'jean.griffith@example5.org', 1),
(u'melanie.white@example2.edu', 2)
])
class TutorialTestTransposeColumnsIntoRows(RefineTestCase):
project_file = 'us_economic_assistance.csv'
def test_transpose_columns_into_rows(self):
# Section "5. Structural Editing, Transpose Columns into Rows"
# {1}, {2}, {3}
response = self.project.transpose_columns_into_rows(
'FY1946', 64, 'pair')
self.assertTrue('64 column(s) starting with FY1946' in
response['historyEntry']['description'])
# {4}
response = self.project.add_column('pair', 'year',
'value[2,6].toNumber()')
self.assertTrue('filling 26185 rows' in
response['historyEntry']['description'])
# {5}
response = self.project.text_transform(column='pair',
expression='value.substring(7).toNumber()')
self.assertTrue('transform on 26185 cells' in
response['historyEntry']['description'])
# {6}
response = self.project.rename_column('pair', 'amount')
self.assertTrue('Rename column pair to amount' in
response['historyEntry']['description'])
# {7}
response = self.project.fill_down('country_name')
self.assertTrue('Fill down 23805 cells' in
response['historyEntry']['description'])
response = self.project.fill_down('program_name')
self.assertTrue('Fill down 23805 cells' in
response['historyEntry']['description'])
# spot check of last row for transforms and fill down
response = self.project.get_rows()
row10 = response.rows[9]
self.assertEqual(row10['country_name'], 'Afghanistan')
self.assertEqual(row10['program_name'],
'Department of Defense Security Assistance')
self.assertEqual(row10['amount'], 113777303)
class TutorialTestTransposeFixedNumbeOfRowsIntoColumns(RefineTestCase):
project_file = 'fixed-rows.csv'
project_file_options = {'split_into_columns': False,
'header_lines': 0}
def test_transpose_fixed_number_of_rows_into_columns(self):
# Section "5. Structural Editing,
# Transpose Fixed Number of Rows into Columns"
# {1}
self.assertTrue('Column' in self.project.column_order)
# {8}
response = self.project.transpose_rows_into_columns('Column', 4)
self.assertTrue('Transpose every 4 cells in column Column' in
response['historyEntry']['description'])
# {9} - renaming column triggers a bug in Refine
# {10}
response = self.project.add_column('Column 1', 'Transaction',
'if(value.contains(" sent "), "send", "receive")')
self.assertTrue('Column 1 by filling 4 rows' in
response['historyEntry']['description'])
# {11}
transaction_facet = facet.TextFacet(column='Transaction',
selection='send')
self.project.engine.add_facet(transaction_facet)
self.project.compute_facets()
# {12}, {13}, {14}
response = self.project.add_column('Column 1', 'Sender',
'value.partition(" sent ")[0]')
# XXX resetting the facet shows data in rows with Transaction=receive
# which shouldn't have been possible with the facet.
response = self.project.add_column('Column 1', 'Recipient',
'value.partition(" to ")[2].partition(" on ")[0]')
response = self.project.add_column('Column 1', 'Amount',
'value.partition(" sent ")[2].partition(" to ")[0]')
# {15}
transaction_facet.reset().include('receive')
response = self.project.get_rows()
# XXX there seems to be some kind of bug where the model doesn't
# match get_rows() output - cellIndex being returned that are
# out of range.
#self.assertTrue(a_row['Sender'] is None)
#self.assertTrue(a_row['Recipient'] is None)
#self.assertTrue(a_row['Amount'] is None)
# {16}
for column, expression in (
('Sender',
'cells["Column 1"].value.partition(" from ")[2]'
'.partition(" on ")[0]'),
('Recipient',
'cells["Column 1"].value.partition(" received ")[0]'),
('Amount',
'cells["Column 1"].value.partition(" received ")[2]'
'.partition(" from ")[0]')
):
response = self.project.text_transform(column, expression)
self.assertTrue('2 cells' in
response['historyEntry']['description'])
# {17}
transaction_facet.reset()
# {18}
response = self.project.text_transform('Column 1',
'value.partition(" on ")[2]')
self.assertTrue('4 cells' in
response['historyEntry']['description'])
# {19}
response = self.project.reorder_columns([
'Transaction', 'Amount', 'Sender', 'Recipient'])
self.assertEqual('Reorder columns',
response['historyEntry']['description'])
class TutorialTestTransposeVariableNumbeOfRowsIntoColumns(RefineTestCase):
project_file = 'variable-rows.csv'
project_file_options = {'split_into_columns': False,
'header_lines': 0}
def test_transpose_variable_number_of_rows_into_columns(self):
# {20}, {21}
response = self.project.add_column('Column', 'First Line',
'if(value.contains(" on "), value, null)')
self.assertTrue('Column by filling 4 rows' in
response['historyEntry']['description'])
response = self.project.get_rows()
first_names = [row['First Line'][0:10] if row['First Line'] else None
for row in response.rows]
self.assertEqual(first_names, ['Tom Dalton', None, None, None,
'Morgan Law', None, None, None, None, 'Eric Batem'])
# {22}
response = self.project.move_column('First Line', 0)
self.assertTrue('Move column First Line to position 0' in
response['historyEntry']['description'])
self.assertEqual(self.project.column_order['First Line'], 0)
# {23}
self.project.engine.mode = 'record-based'
response = self.project.get_rows()
self.assertEqual(response.mode, 'record-based')
self.assertEqual(response.filtered, 4)
# {24}
response = self.project.add_column('Column', 'Status',
'row.record.cells["Column"].value[-1]')
self.assertTrue('filling 18 rows' in
response['historyEntry']['description'])
# {25}
response = self.project.text_transform('Column',
'row.record.cells["Column"].value[1, -1].join("|")')
self.assertTrue('18 cells' in
response['historyEntry']['description'])
# {26}
self.project.engine.mode = 'row-based'
# {27}
blank_facet = facet.BlankFacet('First Line', selection=True)
response = self.project.remove_rows(blank_facet)
self.assertEqual('Remove 14 rows',
response['historyEntry']['description'])
self.project.engine.remove_all()
# {28}
'Split 4 cell(s) in column Column into several columns by separator'
response = self.project.split_column('Column', separator='|')
self.assertTrue('Split 4 cell(s) in column Column' in
response['historyEntry']['description'])
if __name__ == '__main__':
unittest.main()