#!/usr/bin/env python
"""
test_tutorial.py

The tests here are based on David Huynh's Refine tutorial at
http://davidhuynh.net/spaces/nicar2011/tutorial.pdf

The tests perform the Refine actions given in the tutorial (the web
scraping section is only partially covered) and verify the changes the
tutorial says should be observed.

These tests require a connection to a Refine server either at
http://127.0.0.1:3333/ or by specifying environment variables
OPENREFINE_HOST and OPENREFINE_PORT.
"""

# Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.

import unittest

from google.refine import facet
from tests import refinetest


class TutorialTestFacets(refinetest.RefineTestCase):
    """Tutorial section 2: exploring a project through facets."""

    project_file = 'louisiana-elected-officials.csv'
    project_options = {'guess_cell_value_types': True}

    def test_get_rows(self):
        # Section "2. Exploration using Facets": {3}
        response = self.project.get_rows(limit=10)
        self.assertEqual(len(response.rows), 10)
        self.assertEqual(response.limit, 10)
        self.assertEqual(response.total, 6958)
        self.assertEqual(response.filtered, 6958)
        for row in response.rows:
            self.assertFalse(row.flagged)
            self.assertFalse(row.starred)

    def test_facet(self):
        # Section "2. Exploration using Facets": {4}
        party_code_facet = facet.TextFacet(column='Party Code')
        response = self.project.compute_facets(party_code_facet)
        pc = response.facets[0]
        # test look by index same as look up by facet object
        self.assertEqual(pc, response.facets[party_code_facet])
        self.assertEqual(pc.name, 'Party Code')
        self.assertEqual(pc.choices['D'].count, 3700)
        self.assertEqual(pc.choices['N'].count, 15)
        self.assertEqual(pc.blank_choice.count, 1446)
        # {5}, {6}
        engine = facet.Engine(party_code_facet)
        ethnicity_facet = facet.TextFacet(column='Ethnicity')
        engine.add_facet(ethnicity_facet)
        self.project.engine = engine
        response = self.project.compute_facets()
        e = response.facets[ethnicity_facet]
        self.assertEqual(e.choices['B'].count, 1255)
        self.assertEqual(e.choices['W'].count, 4469)
        # {7}
        ethnicity_facet.include('B')
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 1255)
        indexes = [row.index for row in response.rows]
        self.assertEqual(indexes, [1, 2, 3, 4, 6, 12, 18, 26, 28, 32])
        # {8}
        response = self.project.compute_facets()
        pc = response.facets[party_code_facet]
        self.assertEqual(pc.name, 'Party Code')
        self.assertEqual(pc.choices['D'].count, 1179)
        self.assertEqual(pc.choices['R'].count, 11)
        self.assertEqual(pc.blank_choice.count, 46)
        # {9}
        party_code_facet.include('R')
        response = self.project.compute_facets()
        e = response.facets[ethnicity_facet]
        self.assertEqual(e.choices['B'].count, 11)
        # {10}
        party_code_facet.reset()
        ethnicity_facet.reset()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 6958)
        # {11}
        office_title_facet = facet.TextFacet('Office Title')
        self.project.engine.add_facet(office_title_facet)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[2].choices), 76)
        # {12} - XXX not sure how to interpret bins & baseBins yet
        office_level_facet = facet.NumericFacet('Office Level')
        self.project.engine.add_facet(office_level_facet)
        # {13}
        office_level_facet.From = 300   # from reserved word
        office_level_facet.to = 320
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 1907)
        response = self.project.compute_facets()
        ot = response.facets[office_title_facet]
        self.assertEqual(len(ot.choices), 21)
        self.assertEqual(ot.choices['Chief of Police'].count, 2)
        self.assertEqual(ot.choices['Chief of Police '].count, 211)
        # {14}
        self.project.engine.remove_all()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 6958)
        # {15}
        phone_facet = facet.TextFacet('Phone', expression='value[0, 3]')
        self.project.engine.add_facet(phone_facet)
        response = self.project.compute_facets()
        p = response.facets[phone_facet]
        self.assertEqual(p.expression, 'value[0, 3]')
        self.assertEqual(p.choices['318'].count, 2331)
        # {16}
        commissioned_date_facet = facet.NumericFacet(
            'Commissioned Date',
            expression='value.toDate().datePart("year")')
        self.project.engine.add_facet(commissioned_date_facet)
        response = self.project.compute_facets()
        cd = response.facets[commissioned_date_facet]
        self.assertEqual(cd.error_count, 959)
        self.assertEqual(cd.numeric_count, 5999)
        # {17}
        office_description_facet = facet.NumericFacet(
            'Office Description',
            expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
        self.project.engine.add_facet(office_description_facet)
        response = self.project.compute_facets()
        od = response.facets[office_description_facet]
        self.assertEqual(od.min, 0)
        self.assertEqual(od.max, 110)
        self.assertEqual(od.numeric_count, 548)


class TutorialTestEditing(refinetest.RefineTestCase):
    """Tutorial sections 3 and 4: cell editing, clustering, row removal."""

    project_file = 'louisiana-elected-officials.csv'
    project_options = {'guess_cell_value_types': True}

    def test_editing(self):
        # Section "3. Cell Editing": {1}
        self.project.engine.remove_all()    # redundant due to setUp
        # {2}
        self.project.text_transform(column='Zip Code 2',
                                    expression='value.toString()[0, 5]')
        self.assertInResponse('transform on 6067 cells in column Zip Code 2')
        # {3} - XXX history
        # {4}
        office_title_facet = facet.TextFacet('Office Title')
        self.project.engine.add_facet(office_title_facet)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[office_title_facet].choices), 76)
        self.project.text_transform('Office Title', 'value.trim()')
        self.assertInResponse('6895')
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[office_title_facet].choices), 67)
        # {5}
        self.project.edit('Office Title', 'Councilmen', 'Councilman')
        self.assertInResponse('13')
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[office_title_facet].choices), 66)
        # {6}
        response = self.project.compute_clusters('Office Title')
        self.assertFalse(response)
        # {7}
        clusters = self.project.compute_clusters('Office Title', 'knn')
        self.assertEqual(len(clusters), 7)
        first_cluster = clusters[0]
        self.assertEqual(len(first_cluster), 2)
        self.assertEqual(first_cluster[0]['value'], 'RSCC Member')
        self.assertEqual(first_cluster[0]['count'], 233)
        # Not strictly necessary to repeat 'Council Member' but a test
        # of mass_edit, and it's also what the front end sends.
        self.project.mass_edit('Office Title', [{
            'from': ['Council Member', 'Councilmember'],
            'to': 'Council Member'
        }])
        self.assertInResponse('372')
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[office_title_facet].choices), 65)

        # Section "4. Row and Column Editing, Batched Row Deletion"
        # Test doesn't strictly follow the tutorial as the "Browse this
        # cluster" performs a text facet which the server can't complete
        # as it busts its max facet count. The useful work is done with
        # get_rows(). Also, we can facet & select in one; the UI can't.
        # {1}, {2}, {3}, {4}
        clusters = self.project.compute_clusters('Candidate Name')
        for cluster in clusters[0:3]:   # just do a few
            for match in cluster:
                # {2}
                if match['value'].endswith(', '):
                    response = self.project.get_rows(
                        facet.TextFacet('Candidate Name', match['value']))
                    self.assertEqual(len(response.rows), 1)
                    for row in response.rows:
                        self.project.star_row(row)
                        self.assertInResponse(str(row.index + 1))
        # {5}, {6}, {7}
        response = self.project.compute_facets(facet.StarredFacet(True))
        self.assertEqual(len(response.facets[0].choices), 2)    # true & false
        self.assertEqual(response.facets[0].choices[True].count, 3)
        self.project.remove_rows()
        self.assertInResponse('3 rows')


class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
    """Tutorial section 4: duplicate row detection and deletion."""

    project_file = 'duplicates.csv'

    def test_duplicate_detection(self):
        # Section "4. Row and Column Editing,
        #             Duplicate Row Detection and Deletion"
        # {7}, {8}
        response = self.project.get_rows(sort_by='email')
        indexes = [row.index for row in response.rows]
        self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7])
        # {9}
        self.project.reorder_rows()
        self.assertInResponse('Reorder rows')
        response = self.project.get_rows()
        indexes = [row.index for row in response.rows]
        # list(...) so the comparison also holds on Python 3, where range()
        # is a lazy sequence that never compares equal to a list.
        self.assertEqual(indexes, list(range(10)))
        # {10}
        self.project.add_column(
            'email', 'count', 'facetCount(value, "value", "email")')
        self.assertInResponse('column email by filling 10 rows')
        response = self.project.get_rows()
        self.assertEqual(self.project.column_order['email'], 0)  # i.e. 1st
        self.assertEqual(self.project.column_order['count'], 1)  # i.e. 2nd
        counts = [row['count'] for row in response.rows]
        self.assertEqual(counts, [2, 2, 1, 1, 3, 3, 3, 1, 2, 2])
        # {11}
        self.assertFalse(self.project.has_records)
        self.project.blank_down('email')
        self.assertInResponse('Blank down 4 cells')
        self.assertTrue(self.project.has_records)
        response = self.project.get_rows()
        emails = [1 if row['email'] else 0 for row in response.rows]
        self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0])
        # {12}
        blank_facet = facet.BlankFacet('email', selection=True)
        # {13}
        self.project.remove_rows(blank_facet)
        self.assertInResponse('Remove 4 rows')
        self.project.engine.remove_all()
        response = self.project.get_rows()
        email_counts = [(row['email'], row['count']) for row in response.rows]
        self.assertEqual(email_counts, [
            (u'arthur.duff@example4.com', 2),
            (u'ben.morisson@example6.org', 1),
            (u'ben.tyler@example3.org', 1),
            (u'danny.baron@example1.com', 3),
            (u'jean.griffith@example5.org', 1),
            (u'melanie.white@example2.edu', 2)
        ])


class TutorialTestTransposeColumnsIntoRows(refinetest.RefineTestCase):
    """Tutorial section 5: transposing columns into rows."""

    project_file = 'us_economic_assistance.csv'

    def test_transpose_columns_into_rows(self):
        # Section "5. Structural Editing, Transpose Columns into Rows"
        # {1}, {2}, {3}
        self.project.transpose_columns_into_rows('FY1946', 64, 'pair')
        self.assertInResponse('64 column(s) starting with FY1946')
        # {4}
        self.project.add_column('pair', 'year', 'value[2,6].toNumber()')
        self.assertInResponse('filling 26185 rows')
        # {5}
        self.project.text_transform(
            column='pair', expression='value.substring(7).toNumber()')
        self.assertInResponse('transform on 26185 cells')
        # {6}
        self.project.rename_column('pair', 'amount')
        self.assertInResponse('Rename column pair to amount')
        # {7}
        self.project.fill_down('country_name')
        self.assertInResponse('Fill down 23805 cells')
        self.project.fill_down('program_name')
        self.assertInResponse('Fill down 23805 cells')
        # spot check of last row for transforms and fill down
        response = self.project.get_rows()
        row10 = response.rows[9]
        self.assertEqual(row10['country_name'], 'Afghanistan')
        self.assertEqual(row10['program_name'],
                         'Department of Defense Security Assistance')
        self.assertEqual(row10['amount'], 113777303)


class TutorialTestTransposeFixedNumberOfRowsIntoColumns(
        refinetest.RefineTestCase):
    """Tutorial section 5: transposing a fixed number of rows into columns."""

    project_file = 'fixed-rows.csv'
    project_format = 'text/line-based'
    project_options = {'header_lines': 0}

    def test_transpose_fixed_number_of_rows_into_columns(self):
        if self.server.version not in ('2.0', '2.1'):
            self.project.rename_column('Column 1', 'Column')
        # Section "5. Structural Editing,
        #             Transpose Fixed Number of Rows into Columns"
        # {1}
        self.assertTrue('Column' in self.project.column_order)
        # {8}
        self.project.transpose_rows_into_columns('Column', 4)
        self.assertInResponse('Transpose every 4 cells in column Column')
        # {9} - renaming column triggers a bug in Refine <= 2.1
        if self.server.version not in ('2.0', '2.1'):
            self.project.rename_column('Column 2', 'Address')
            self.project.rename_column('Column 3', 'Address 2')
            self.project.rename_column('Column 4', 'Status')
        # {10}
        self.project.add_column(
            'Column 1', 'Transaction',
            'if(value.contains(" sent "), "send", "receive")')
        self.assertInResponse('Column 1 by filling 4 rows')
        # {11}
        transaction_facet = facet.TextFacet(column='Transaction',
                                            selection='send')
        self.project.engine.add_facet(transaction_facet)
        self.project.compute_facets()
        # {12}, {13}, {14}
        self.project.add_column(
            'Column 1', 'Sender',
            'value.partition(" sent ")[0]')
        # XXX resetting the facet shows data in rows with Transaction=receive
        #     which shouldn't have been possible with the facet.
        self.project.add_column(
            'Column 1', 'Recipient',
            'value.partition(" to ")[2].partition(" on ")[0]')
        self.project.add_column(
            'Column 1', 'Amount',
            'value.partition(" sent ")[2].partition(" to ")[0]')
        # {15}
        transaction_facet.reset().include('receive')
        self.project.get_rows()
        # XXX there seems to be some kind of bug where the model doesn't
        #     match get_rows() output - cellIndex being returned that are
        #     out of range.
        #self.assertTrue(a_row['Sender'] is None)
        #self.assertTrue(a_row['Recipient'] is None)
        #self.assertTrue(a_row['Amount'] is None)
        # {16}
        for column, expression in (
            ('Sender',
             'cells["Column 1"].value.partition(" from ")[2].partition(" on ")[0]'),
            ('Recipient',
             'cells["Column 1"].value.partition(" received ")[0]'),
            ('Amount',
             'cells["Column 1"].value.partition(" received ")[2].partition(" from ")[0]')
        ):
            self.project.text_transform(column, expression)
            self.assertInResponse('2 cells')
        # {17}
        transaction_facet.reset()
        # {18}
        self.project.text_transform('Column 1', 'value.partition(" on ")[2]')
        self.assertInResponse('4 cells')
        # {19}
        self.project.reorder_columns(['Transaction', 'Amount', 'Sender',
                                      'Recipient'])
        self.assertInResponse('Reorder columns')


class TutorialTestTransposeVariableNumberOfRowsIntoColumns(
        refinetest.RefineTestCase):
    """Tutorial section 5: transposing variable-length records into columns."""

    project_file = 'variable-rows.csv'
    project_format = 'text/line-based'
    project_options = {'header_lines': 0}

    def test_transpose_variable_number_of_rows_into_columns(self):
        # {20}, {21}
        if self.server.version not in ('2.0', '2.1'):
            self.project.rename_column('Column 1', 'Column')
        self.project.add_column(
            'Column', 'First Line',
            'if(value.contains(" on "), value, null)')
        self.assertInResponse('Column by filling 4 rows')
        response = self.project.get_rows()
        first_names = [row['First Line'][0:10] if row['First Line'] else None
                       for row in response.rows]
        self.assertEqual(first_names, [
            'Tom Dalton', None, None, None,
            'Morgan Law', None, None, None, None,
            'Eric Batem'])
        # {22}
        self.project.move_column('First Line', 0)
        self.assertInResponse('Move column First Line to position 0')
        self.assertEqual(self.project.column_order['First Line'], 0)
        # {23}
        self.project.engine.mode = 'record-based'
        response = self.project.get_rows()
        self.assertEqual(response.mode, 'record-based')
        self.assertEqual(response.filtered, 4)
        # {24}
        self.project.add_column(
            'Column', 'Status',
            'row.record.cells["Column"].value[-1]')
        self.assertInResponse('filling 18 rows')
        # {25}
        self.project.text_transform(
            'Column', 'row.record.cells["Column"].value[1, -1].join("|")')
        self.assertInResponse('18 cells')
        # {26}
        self.project.engine.mode = 'row-based'
        # {27}
        blank_facet = facet.BlankFacet('First Line', selection=True)
        self.project.remove_rows(blank_facet)
        self.assertInResponse('Remove 14 rows')
        self.project.engine.remove_all()
        # {28}
        self.project.split_column('Column', separator='|')
        self.assertInResponse('Split 4 cell(s) in column Column')


class TutorialTestWebScraping(refinetest.RefineTestCase):
    """Tutorial section 6: cleaning up scraped page data."""

    project_file = 'eli-lilly.csv'

    # NOTE(review): the first replace() argument reached this file as a
    # plain/non-breaking space, which made the call a no-op; it has been
    # restored to the HTML entity "&#160;" on the assumption that the
    # entity was decoded by an extraction step - confirm against the data.
    filter_expr_1 = """
        forEach(
            value[2,-2].replace("&#160;", " ").split("), ("),
            v,
            v[0,-1].partition(", '", true).join(":")
        ).join("|")
    """
    filter_expr_2 = """
        filter(
            value.split("|"),
            p,
            p.partition(":")[0].toNumber() == %d
        )[0].partition(":")[2]
    """

    def test_web_scraping(self):
        # Section "6. Web Scraping"
        # {1}, {2}
        self.project.split_column('key', separator=':')
        self.assertInResponse('Split 5409 cell(s) in column key')
        self.project.rename_column('key 1', 'page')
        self.assertInResponse('Rename column key 1 to page')
        self.project.rename_column('key 2', 'top')
        self.assertInResponse('Rename column key 2 to top')
        self.project.move_column('line', 'end')
        self.assertInResponse('Move column line to position 2')
        # {3}
        self.project.sorting = facet.Sorting([
            {'column': 'page', 'valueType': 'number'},
            {'column': 'top', 'valueType': 'number'},
        ])
        self.project.reorder_rows()
        self.assertInResponse('Reorder rows')
        first_row = self.project.get_rows(limit=1).rows[0]
        self.assertEqual(first_row['page'], 1)
        self.assertEqual(first_row['top'], 24)
        # {4}
        filter_facet = facet.TextFilterFacet('line', 'ahman')
        rows = self.project.get_rows(filter_facet).rows
        self.assertEqual(len(rows), 1)
        self.assertEqual(rows[0]['top'], 106)
        filter_facet.query = 'alvarez'
        rows = self.project.get_rows().rows
        self.assertEqual(len(rows), 2)
        self.assertEqual(rows[-1]['top'], 567)
        self.project.engine.remove_all()
        # {5} - tutorial says 'line'; it means 'top'
        line_facet = facet.NumericFacet('top')
        line_facet.to = 100
        self.project.remove_rows(line_facet)
        self.assertInResponse('Remove 775 rows')
        line_facet.From = 570
        line_facet.to = 600
        self.project.remove_rows(line_facet)
        self.assertInResponse('Remove 71 rows')
        line_facet.reset()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 4563)
        # {6}
        page_facet = facet.TextFacet('page', 1)   # 1 not '1'
        self.project.engine.add_facet(page_facet)
        # {7}
        rows = self.project.get_rows().rows
        # Look for a row with a name in it by skipping HTML.
        # NOTE(review): the tag literal arrived here as '' (always "in" any
        # string, so the list was empty and [0] raised IndexError); '<b>'
        # is the presumed original marker - confirm against the data.
        name_row = [row for row in rows if '<b>' not in row['line']][0]
        self.assertTrue('WELLNESS' in name_row['line'])
        self.assertEqual(name_row['top'], 161)
        line_facet.From = 20
        line_facet.to = 160
        self.project.remove_rows()
        self.assertInResponse('Remove 9 rows')
        self.project.engine.remove_all()
        # {8}
        self.project.text_transform('line', expression=self.filter_expr_1)
        self.assertInResponse('Text transform on 4554 cells in column line')
        # {9} - XXX following is generating Java exceptions
        #filter_expr = self.filter_expr_2 % 16
        #self.project.add_column('line', 'Name', expression=filter_expr)
        # {10} to the final {19} - nothing new in terms of exercising the API.


if __name__ == '__main__':
    unittest.main()