openrefine-client/tests/test_tutorial.py

475 lines
20 KiB
Python

#!/usr/bin/env python
"""
test_tutorial.py
The tests here are based on David Huynh's Refine tutorial at
http://davidhuynh.net/spaces/nicar2011/tutorial.pdf

The tests perform all the Refine actions given in the tutorial (except the
web scraping) and verify the changes that the tutorial says should be
observed.
These tests require a connection to a Refine server either at
http://127.0.0.1:3333/ or by specifying environment variables
GOOGLE_REFINE_HOST and GOOGLE_REFINE_PORT.
"""
# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
import unittest
from google.refine import facet
from tests import refinetest
class TutorialTestFacets(refinetest.RefineTestCase):
    """Tutorial section 2: exploring a data set with facets.

    Numbers in braces ({3}, {4}, ...) in the comments are the step numbers
    used by the tutorial.  The expected counts are specific to the data
    file named by ``project_file``.
    """

    project_file = 'louisiana-elected-officials.csv'

    def test_get_rows(self):
        # Section "2. Exploration using Facets": {3}
        # Fetch ten rows, then check the paging metadata and per-row flags.
        page = self.project.get_rows(limit=10)
        self.assertEqual(len(page.rows), 10)
        self.assertEqual(page.limit, 10)
        self.assertEqual(page.total, 6958)
        self.assertEqual(page.filtered, 6958)
        for entry in page.rows:
            self.assertFalse(entry.flagged)
            self.assertFalse(entry.starred)

    def test_facet(self):
        # Section "2. Exploration using Facets": {4}
        by_party = facet.TextFacet(column='Party Code')
        computed = self.project.compute_facets(by_party)
        party = computed.facets[0]
        # A facet result can be fetched by position or by facet object;
        # both lookups must give the same result.
        self.assertEqual(party, computed.facets[by_party])
        self.assertEqual(party.name, 'Party Code')
        self.assertEqual(party.choices['D'].count, 3700)
        self.assertEqual(party.choices['N'].count, 15)
        self.assertEqual(party.blank_choice.count, 1446)

        # {5}, {6}
        engine = facet.Engine(by_party)
        by_ethnicity = facet.TextFacet(column='Ethnicity')
        engine.add_facet(by_ethnicity)
        self.project.engine = engine
        computed = self.project.compute_facets()
        ethnicity = computed.facets[by_ethnicity]
        self.assertEqual(ethnicity.choices['B'].count, 1255)
        self.assertEqual(ethnicity.choices['W'].count, 4469)

        # {7}
        by_ethnicity.include('B')
        listing = self.project.get_rows()
        self.assertEqual(listing.filtered, 1255)
        self.assertEqual([entry.index for entry in listing.rows],
                         [1, 2, 3, 4, 6, 12, 18, 26, 28, 32])

        # {8}
        computed = self.project.compute_facets()
        party = computed.facets[by_party]
        self.assertEqual(party.name, 'Party Code')
        self.assertEqual(party.choices['D'].count, 1179)
        self.assertEqual(party.choices['R'].count, 11)
        self.assertEqual(party.blank_choice.count, 46)

        # {9}
        by_party.include('R')
        computed = self.project.compute_facets()
        ethnicity = computed.facets[by_ethnicity]
        self.assertEqual(ethnicity.choices['B'].count, 11)

        # {10}
        by_party.reset()
        by_ethnicity.reset()
        listing = self.project.get_rows()
        self.assertEqual(listing.filtered, 6958)

        # {11}
        by_title = facet.TextFacet('Office Title')
        self.project.engine.add_facet(by_title)
        computed = self.project.compute_facets()
        # The office title facet is the third one on the engine.
        third_facet_choices = computed.facets[2].choices
        self.assertEqual(len(third_facet_choices), 76)

        # {12} - XXX not sure how to interpret bins & baseBins yet
        by_level = facet.NumericFacet('Office Level')
        self.project.engine.add_facet(by_level)

        # {13}
        # The attribute is spelled 'From' because 'from' is a reserved word.
        by_level.From = 300
        by_level.to = 320
        listing = self.project.get_rows()
        self.assertEqual(listing.filtered, 1907)
        computed = self.project.compute_facets()
        titles = computed.facets[by_title]
        self.assertEqual(len(titles.choices), 21)
        # Two distinct choices here: one without and one with a trailing
        # space in the title.
        self.assertEqual(titles.choices['Chief of Police'].count, 2)
        self.assertEqual(titles.choices['Chief of Police '].count, 211)

        # {14}
        self.project.engine.remove_all()
        listing = self.project.get_rows()
        self.assertEqual(listing.filtered, 6958)

        # {15}
        prefix_expression = 'value[0, 3]'
        by_phone = facet.TextFacet('Phone', expression=prefix_expression)
        self.project.engine.add_facet(by_phone)
        computed = self.project.compute_facets()
        phone = computed.facets[by_phone]
        self.assertEqual(phone.expression, 'value[0, 3]')
        self.assertEqual(phone.choices['318'].count, 2331)

        # {16}
        by_year = facet.NumericFacet(
            'Commissioned Date',
            expression='value.toDate().datePart("year")')
        self.project.engine.add_facet(by_year)
        computed = self.project.compute_facets()
        year = computed.facets[by_year]
        self.assertEqual(year.error_count, 959)
        self.assertEqual(year.numeric_count, 5999)

        # {17}
        by_description = facet.NumericFacet(
            'Office Description',
            expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
        self.project.engine.add_facet(by_description)
        computed = self.project.compute_facets()
        description = computed.facets[by_description]
        self.assertEqual(description.min, 0)
        self.assertEqual(description.max, 110)
        self.assertEqual(description.numeric_count, 548)
class TutorialTestEditing(refinetest.RefineTestCase):
    """Tutorial section 3 (cell editing) plus the batched row deletion
    part of section 4.

    Numbers in braces in the comments are the tutorial's step numbers.
    """

    project_file = 'louisiana-elected-officials.csv'

    def test_editing(self):
        # Section "3. Cell Editing": {1}
        self.project.engine.remove_all()    # redundant due to setUp

        # {2}
        self.project.text_transform(
            column='Zip Code 2',
            expression='value.toString()[0, 5]')
        self.assertInResponse('transform on 6067 cells in column Zip Code 2')

        # {3} - XXX history

        # {4}
        title_facet = facet.TextFacet('Office Title')
        self.project.engine.add_facet(title_facet)

        def title_choice_count():
            # Recompute the facets and report how many distinct office
            # titles are currently offered as choices.
            computed = self.project.compute_facets()
            return len(computed.facets[title_facet].choices)

        self.assertEqual(title_choice_count(), 76)
        self.project.text_transform('Office Title', 'value.trim()')
        self.assertInResponse('6895')
        self.assertEqual(title_choice_count(), 67)

        # {5}
        self.project.edit('Office Title', 'Councilmen', 'Councilman')
        self.assertInResponse('13')
        self.assertEqual(title_choice_count(), 66)

        # {6}
        # Clustering with the default settings is expected to find nothing.
        self.assertFalse(self.project.compute_clusters('Office Title'))

        # {7}
        knn_clusters = self.project.compute_clusters('Office Title', 'knn')
        self.assertEqual(len(knn_clusters), 7)
        leading = knn_clusters[0]
        self.assertEqual(len(leading), 2)
        self.assertEqual(leading[0]['value'], 'RSCC Member')
        self.assertEqual(leading[0]['count'], 233)
        # Not strictly necessary to repeat 'Council Member' but a test
        # of mass_edit, and it's also what the front end sends.
        merge_spec = {
            'from': ['Council Member', 'Councilmember'],
            'to': 'Council Member'
        }
        self.project.mass_edit('Office Title', [merge_spec])
        self.assertInResponse('372')
        self.assertEqual(title_choice_count(), 65)

        # Section "4. Row and Column Editing, Batched Row Deletion"
        # Test doesn't strictly follow the tutorial as the "Browse this
        # cluster" performs a text facet which the server can't complete
        # as it busts its max facet count. The useful work is done with
        # get_rows(). Also, we can facet & select in one; the UI can't.
        # {1}, {2}, {3}, {4}
        name_clusters = self.project.compute_clusters('Candidate Name')
        for group in name_clusters[0:3]:    # just do a few
            for candidate in group:
                # {2}
                # Only names ending in ', ' are starred for deletion.
                if not candidate['value'].endswith(', '):
                    continue
                found = self.project.get_rows(
                    facet.TextFacet('Candidate Name', candidate['value']))
                self.assertEqual(len(found.rows), 1)
                for hit in found.rows:
                    self.project.star_row(hit)
                    self.assertInResponse(str(hit.index + 1))

        # {5}, {6}, {7}
        starred = self.project.compute_facets(facet.StarredFacet(True))
        star_choices = starred.facets[0].choices
        self.assertEqual(len(star_choices), 2)    # true & false
        self.assertEqual(starred.facets[0].choices[True].count, 3)
        self.project.remove_rows()
        self.assertInResponse('3 rows')
class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
    """Tutorial section 4: duplicate row detection and deletion.

    Numbers in braces in the comments are the tutorial's step numbers.
    """

    project_file = 'duplicates.csv'

    def test_duplicate_detection(self):
        # Section "4. Row and Column Editing,
        #          Duplicate Row Detection and Deletion"
        # {7}, {8}
        # Sorting by email only changes the order rows are returned in;
        # the row indexes still show the original positions.
        response = self.project.get_rows(sort_by='email')
        indexes = [row.index for row in response.rows]
        self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7])

        # {9}
        self.project.reorder_rows()
        self.assertInResponse('Reorder rows')
        response = self.project.get_rows()
        indexes = [row.index for row in response.rows]
        # Compare against a real list: on Python 3 a range object never
        # compares equal to a list, so a bare range(10) would always fail.
        self.assertEqual(indexes, list(range(10)))

        # {10}
        # Add a 'count' column holding how often each email occurs.
        self.project.add_column('email', 'count',
                                'facetCount(value, "value", "email")')
        self.assertInResponse('column email by filling 10 rows')
        response = self.project.get_rows()
        self.assertEqual(self.project.column_order['email'], 0)  # i.e. 1st
        self.assertEqual(self.project.column_order['count'], 1)  # i.e. 2nd
        counts = [row['count'] for row in response.rows]
        self.assertEqual(counts, [2, 2, 1, 1, 3, 3, 3, 1, 2, 2])

        # {11}
        self.assertFalse(self.project.has_records)
        self.project.blank_down('email')
        self.assertInResponse('Blank down 4 cells')
        self.assertTrue(self.project.has_records)
        response = self.project.get_rows()
        # 1 where the email cell still has a value, 0 where it was blanked.
        emails = [1 if row['email'] else 0 for row in response.rows]
        self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0])

        # {12}
        blank_facet = facet.BlankFacet('email', selection=True)

        # {13}
        self.project.remove_rows(blank_facet)
        self.assertInResponse('Remove 4 rows')
        self.project.engine.remove_all()
        response = self.project.get_rows()
        email_counts = [(row['email'], row['count'])
                        for row in response.rows]
        self.assertEqual(email_counts, [
            (u'arthur.duff@example4.com', 2),
            (u'ben.morisson@example6.org', 1),
            (u'ben.tyler@example3.org', 1),
            (u'danny.baron@example1.com', 3),
            (u'jean.griffith@example5.org', 1),
            (u'melanie.white@example2.edu', 2)
        ])
class TutorialTestTransposeColumnsIntoRows(refinetest.RefineTestCase):
    """Tutorial section 5: transposing columns into rows.

    Numbers in braces in the comments are the tutorial's step numbers.
    """

    project_file = 'us_economic_assistance.csv'

    def test_transpose_columns_into_rows(self):
        # Section "5. Structural Editing, Transpose Columns into Rows"
        # {1}, {2}, {3}
        self.project.transpose_columns_into_rows('FY1946', 64, 'pair')
        self.assertInResponse('64 column(s) starting with FY1946')

        # {4}
        year_expression = 'value[2,6].toNumber()'
        self.project.add_column('pair', 'year', year_expression)
        self.assertInResponse('filling 26185 rows')

        # {5}
        self.project.text_transform(
            column='pair',
            expression='value.substring(7).toNumber()')
        self.assertInResponse('transform on 26185 cells')

        # {6}
        self.project.rename_column('pair', 'amount')
        self.assertInResponse('Rename column pair to amount')

        # {7}
        # Both columns are filled down in turn and report the same count.
        for column_name in ('country_name', 'program_name'):
            self.project.fill_down(column_name)
            self.assertInResponse('Fill down 23805 cells')

        # Spot check the tenth returned row for transforms and fill down.
        tenth = self.project.get_rows().rows[9]
        self.assertEqual(tenth['country_name'], 'Afghanistan')
        self.assertEqual(tenth['program_name'],
                         'Department of Defense Security Assistance')
        self.assertEqual(tenth['amount'], 113777303)
class TutorialTestTransposeFixedNumbeOfRowsIntoColumns(
        refinetest.RefineTestCase):
    """Tutorial section 5: transposing a fixed number of rows into columns.

    Numbers in braces in the comments are the tutorial's step numbers.
    """

    project_file = 'fixed-rows.csv'
    project_file_options = {'split_into_columns': False,
                            'header_lines': 0}

    def test_transpose_fixed_number_of_rows_into_columns(self):
        # Section "5. Structural Editing,
        #          Transpose Fixed Number of Rows into Columns"
        # {1}
        known_columns = self.project.column_order
        self.assertTrue('Column' in known_columns)

        # {8}
        self.project.transpose_rows_into_columns('Column', 4)
        self.assertInResponse('Transpose every 4 cells in column Column')

        # {9} - renaming column triggers a bug in Refine

        # {10}
        self.project.add_column(
            'Column 1', 'Transaction',
            'if(value.contains(" sent "), "send", "receive")')
        self.assertInResponse('Column 1 by filling 4 rows')

        # {11}
        by_transaction = facet.TextFacet(column='Transaction',
                                         selection='send')
        self.project.engine.add_facet(by_transaction)
        self.project.compute_facets()

        # {12}, {13}, {14}
        self.project.add_column(
            'Column 1', 'Sender',
            'value.partition(" sent ")[0]')
        # XXX resetting the facet shows data in rows with Transaction=receive
        #     which shouldn't have been possible with the facet.
        self.project.add_column(
            'Column 1', 'Recipient',
            'value.partition(" to ")[2].partition(" on ")[0]')
        self.project.add_column(
            'Column 1', 'Amount',
            'value.partition(" sent ")[2].partition(" to ")[0]')

        # {15}
        by_transaction.reset().include('receive')
        response = self.project.get_rows()
        # XXX there seems to be some kind of bug where the model doesn't
        #     match get_rows() output - cellIndex being returned that are
        #     out of range.
        #self.assertTrue(a_row['Sender'] is None)
        #self.assertTrue(a_row['Recipient'] is None)
        #self.assertTrue(a_row['Amount'] is None)

        # {16}
        # One (column, expression) pair per column; each transform is
        # expected to touch two cells.
        received_row_fixes = (
            ('Sender',
             'cells["Column 1"].value.partition(" from ")[2]'
             '.partition(" on ")[0]'),
            ('Recipient',
             'cells["Column 1"].value.partition(" received ")[0]'),
            ('Amount',
             'cells["Column 1"].value.partition(" received ")[2]'
             '.partition(" from ")[0]')
        )
        for target, grel in received_row_fixes:
            self.project.text_transform(target, grel)
            self.assertInResponse('2 cells')

        # {17}
        by_transaction.reset()

        # {18}
        self.project.text_transform('Column 1',
                                    'value.partition(" on ")[2]')
        self.assertInResponse('4 cells')

        # {19}
        final_order = ['Transaction', 'Amount', 'Sender', 'Recipient']
        self.project.reorder_columns(final_order)
        self.assertInResponse('Reorder columns')
class TutorialTestTransposeVariableNumbeOfRowsIntoColumns(
        refinetest.RefineTestCase):
    """Tutorial section 5: transposing a variable number of rows into
    columns.

    Numbers in braces in the comments are the tutorial's step numbers.
    """

    project_file = 'variable-rows.csv'
    project_file_options = {'split_into_columns': False,
                            'header_lines': 0}

    def test_transpose_variable_number_of_rows_into_columns(self):
        # {20}, {21}
        self.project.add_column(
            'Column', 'First Line',
            'if(value.contains(" on "), value, null)')
        self.assertInResponse('Column by filling 4 rows')
        listing = self.project.get_rows()
        # First ten characters of each 'First Line' cell, or None where
        # the cell is empty.
        name_prefixes = []
        for entry in listing.rows:
            first_line = entry['First Line']
            name_prefixes.append(first_line[0:10] if first_line else None)
        self.assertEqual(name_prefixes, [
            'Tom Dalton', None, None, None,
            'Morgan Law', None, None, None, None, 'Eric Batem'])

        # {22}
        self.project.move_column('First Line', 0)
        self.assertInResponse('Move column First Line to position 0')
        self.assertEqual(self.project.column_order['First Line'], 0)

        # {23}
        self.project.engine.mode = 'record-based'
        listing = self.project.get_rows()
        self.assertEqual(listing.mode, 'record-based')
        self.assertEqual(listing.filtered, 4)

        # {24}
        self.project.add_column(
            'Column', 'Status',
            'row.record.cells["Column"].value[-1]')
        self.assertInResponse('filling 18 rows')

        # {25}
        self.project.text_transform(
            'Column',
            'row.record.cells["Column"].value[1, -1].join("|")')
        self.assertInResponse('18 cells')

        # {26}
        self.project.engine.mode = 'row-based'

        # {27}
        no_first_line = facet.BlankFacet('First Line', selection=True)
        self.project.remove_rows(no_first_line)
        self.assertInResponse('Remove 14 rows')
        self.project.engine.remove_all()

        # {28}
        self.project.split_column('Column', separator='|')
        self.assertInResponse('Split 4 cell(s) in column Column')
class TutorialTestWebScraping(refinetest.RefineTestCase):
    """Tutorial section 6: cleaning up scraped data, steps {1} to {8}.

    Numbers in braces in the comments are the tutorial's step numbers.
    Step {9} is disabled (see the note at the end of the test) and steps
    {10} onwards are not exercised.
    """

    project_file = 'eli-lilly.csv'

    # GREL expression applied to the 'line' column in step {8}.  The text
    # of the literal is kept flush-left so its contents stay unchanged.
    # NOTE(review): if both arguments of replace(...) below are ordinary
    # spaces the call is a no-op; the first argument may originally have
    # been a non-breaking space or an entity -- confirm against upstream.
    filter_expr_1 = """
forEach(
value[2,-2].replace(" ", " ").split("), ("),
v,
v[0,-1].partition(", '", true).join(":")
).join("|")
"""

    # GREL template with a %d placeholder.  It is only referenced by the
    # disabled step {9} code at the end of test_web_scraping.
    filter_expr_2 = """
filter(
value.split("|"), p, p.partition(":")[0].toNumber() == %d
)[0].partition(":")[2]
"""

    def test_web_scraping(self):
        # Section "6. Web Scraping"
        # {1}, {2}
        # Split 'key' on ':' and rename the two resulting columns, then
        # move 'line' to the end.
        self.project.split_column('key', separator=':')
        self.assertInResponse('Split 5409 cell(s) in column key')
        self.project.rename_column('key 1', 'page')
        self.assertInResponse('Rename column key 1 to page')
        self.project.rename_column('key 2', 'top')
        self.assertInResponse('Rename column key 2 to top')
        self.project.move_column('line', 'end')
        self.assertInResponse('Move column line to position 2')
        # {3}
        # Sort numerically by page, then by top, and make the order
        # permanent with reorder_rows().
        self.project.sorting = facet.Sorting([
            {'column': 'page', 'valueType': 'number'},
            {'column': 'top', 'valueType': 'number'},
        ])
        self.project.reorder_rows()
        self.assertInResponse('Reorder rows')
        first_row = self.project.get_rows(limit=1).rows[0]
        self.assertEqual(first_row['page'], 1)
        self.assertEqual(first_row['top'], 24)
        # {4}
        filter_facet = facet.TextFilterFacet('line', 'ahman')
        rows = self.project.get_rows(filter_facet).rows
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['top'], 106)
        # The query is changed on the same facet object and get_rows() is
        # called without arguments.
        # NOTE(review): presumably get_rows(filter_facet) above left the
        # facet installed on the project's engine -- confirm.
        filter_facet.query = 'alvarez'
        rows = self.project.get_rows().rows
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[-1]['top'], 567)
        self.project.engine.remove_all()
        # {5} - tutorial says 'line'; it means 'top'
        # The same numeric facet is reused with two different ranges to
        # remove rows in two passes.
        line_facet = facet.NumericFacet('top')
        line_facet.to = 100
        self.project.remove_rows(line_facet)
        self.assertInResponse('Remove 775 rows')
        line_facet.From = 570
        line_facet.to = 600
        self.project.remove_rows(line_facet)
        self.assertInResponse('Remove 71 rows')
        line_facet.reset()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 4563)
        # {6}
        page_facet = facet.TextFacet('page', 1)     # 1 not '1'
        self.project.engine.add_facet(page_facet)
        # {7}
        rows = self.project.get_rows().rows
        # Look for a row with a name in it by skipping HTML
        name_row = [row for row in rows if '<b>' not in row['line']][0]
        self.assertTrue('WELLNESS' in name_row['line'])
        self.assertEqual(name_row['top'], 161)
        # Narrow line_facet to the range just above the name row and
        # remove what it selects.
        # NOTE(review): remove_rows() takes no facet here; presumably
        # line_facet is still on the engine from step {5} -- confirm.
        line_facet.From = 20
        line_facet.to = 160
        self.project.remove_rows()
        self.assertInResponse('Remove 9 rows')
        self.project.engine.remove_all()
        # {8}
        self.project.text_transform('line', expression=self.filter_expr_1)
        self.assertInResponse('Text transform on 4554 cells in column line')
        # {9} - XXX following is generating Java exceptions
        #filter_expr = self.filter_expr_2 % 16
        #self.project.add_column('line', 'Name', expression=filter_expr)
        # {10} to the final {19} - nothing new in terms of exercising the API.
# Run every test case in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()