2011-04-23 06:00:10 +02:00
|
|
|
#!/usr/bin/env python
|
|
|
|
# encoding: utf-8
|
|
|
|
"""
|
|
|
|
test_refine.py
|
|
|
|
|
|
|
|
Created by Paul Makepeace on 2011-04-22.
|
|
|
|
Copyright (c) 2011 Real Programmers. All rights reserved.
|
|
|
|
"""
|
|
|
|
|
|
|
|
import sys
|
|
|
|
import os
|
|
|
|
import unittest
|
|
|
|
from google.refine import REFINE_HOST, REFINE_PORT
|
2011-04-25 07:08:53 +02:00
|
|
|
from google.refine import NumericFacet, TextFacet, StarredFacet, Engine
|
2011-04-23 06:00:10 +02:00
|
|
|
from google.refine import RefineServer, Refine, RefineProject
|
2011-04-25 00:36:19 +02:00
|
|
|
from google.refine import to_camel, from_camel
|
2011-04-23 06:00:10 +02:00
|
|
|
|
2011-04-23 21:33:21 +02:00
|
|
|
# Relative directory that holds the CSV fixtures. RefineTestCase.setUp joins
# it with a test class's ``project_file`` to locate the file to upload.
PATH_TO_TEST_DATA = os.path.join('google', 'test', 'data')
|
2011-04-24 18:43:11 +02:00
|
|
|
|
2011-04-25 00:36:19 +02:00
|
|
|
|
|
|
|
class CamelTest(unittest.TestCase):
    """Checks the to_camel / from_camel attribute-name converters."""

    def test_to_camel(self):
        # Each case is (input name, expected camelCase result).
        cases = [
            ('this', 'this'),
            ('this_attr', 'thisAttr'),
            ('From', 'from'),
        ]
        for name, expected in cases:
            self.assertEqual(to_camel(name), expected)

    def test_from_camel(self):
        # Each case is (input name, expected snake_case result).
        cases = [
            ('this', 'this'),
            ('This', 'this'),
            ('thisAttr', 'this_attr'),
            ('ThisAttr', 'this_attr'),
            ('From', 'from'),
        ]
        for name, expected in cases:
            self.assertEqual(from_camel(name), expected)
|
|
|
|
|
|
|
|
|
2011-04-23 06:00:10 +02:00
|
|
|
class RefineTestCase(unittest.TestCase):
    """Shared fixture for the Refine tests.

    setUp builds a RefineServer and a Refine client for every test. When a
    subclass sets ``project_file``, setUp also creates a project from that
    file (looked up under PATH_TO_TEST_DATA) and tearDown deletes it again.
    """

    # File name (under PATH_TO_TEST_DATA) to create a project from, or None
    # for tests that do not need a project.
    project_file = None
    # Project created by setUp; stays None when project_file is unset.
    project = None

    # Section "2. Exploration using Facets": {1}, {2}
    def setUp(self):
        self.server = RefineServer()
        self.refine = Refine(self.server)
        if not self.project_file:
            return
        data_path = os.path.join(PATH_TO_TEST_DATA, self.project_file)
        self.project = self.refine.new_project(data_path)

    def tearDown(self):
        if not self.project:
            return
        self.project.delete()
        self.project = None
|
2011-04-24 18:43:11 +02:00
|
|
|
|
|
|
|
|
2011-04-23 06:00:10 +02:00
|
|
|
class RefineServerTest(RefineTestCase):
    """Server-level checks that need no project (project_file stays None)."""

    def test_init(self):
        # With no argument the server URL is built from the module defaults.
        default_url = 'http://%s:%s' % (REFINE_HOST, REFINE_PORT)
        self.assertEqual(self.server.server, default_url)
        # An explicit URL is expected back without its trailing slash.
        custom_server = RefineServer('http://refine.example/')
        self.assertEqual(custom_server.server, 'http://refine.example')

    def test_list_projects(self):
        listing = self.refine.list_projects()
        self.assertTrue(isinstance(listing, dict))

    def test_get_version(self):
        version_info = self.refine.get_version()
        expected_keys = ('revision', 'version', 'full_version', 'full_name')
        for key in expected_keys:
            self.assertTrue(key in version_info)
|
|
|
|
|
2011-04-23 06:00:10 +02:00
|
|
|
|
|
|
|
class RefineTest(RefineTestCase):
    """Project creation, model loading and deletion using duplicates.csv."""

    project_file = 'duplicates.csv'

    def test_new_project(self):
        # setUp already created the project from project_file.
        self.assertTrue(isinstance(self.project, RefineProject))

    def test_get_models(self):
        self.assertEqual(self.project.key_column, 'email')
        self.assertTrue('email' in self.project.columns)
        self.assertEqual(self.project.column_index['name'], 1)

    def test_delete_project(self):
        self.assertTrue(self.project.delete())
        # The project is gone now; clear the reference so tearDown does not
        # send a second delete for a project that no longer exists. If the
        # assertion above fails, the reference is kept and tearDown still
        # attempts the cleanup.
        self.project = None
|
2011-04-23 06:00:10 +02:00
|
|
|
|
|
|
|
|
|
|
|
class TutorialTestFacets(RefineTestCase):
    """Tutorial section "2. Exploration using Facets".

    Runs against louisiana-elected-officials.csv. The ``{n}`` markers in the
    comments are the step numbers of that tutorial section.
    """

    project_file = 'louisiana-elected-officials.csv'

    def test_get_rows(self):
        # Section "2. Exploration using Facets": {3}
        page = self.project.get_rows(limit=10)
        self.assertEqual(len(page.rows), 10)
        self.assertEqual(page.limit, 10)
        self.assertEqual(page.total, 6958)
        # None of the returned rows should be flagged or starred.
        for record in page.rows:
            self.assertFalse(record.flagged)
            self.assertFalse(record.starred)

    def test_facet(self):
        # Section "2. Exploration using Facets": {4}
        party_code_facet = TextFacet(column='Party Code')
        response = self.project.compute_facets(party_code_facet)
        party_codes = response.facets[0]
        self.assertEqual(party_codes.name, 'Party Code')
        self.assertEqual(party_codes.choices['D'].count, 3700)
        self.assertEqual(party_codes.choices['N'].count, 15)
        self.assertEqual(party_codes.blank_choice.count, 1446)

        # {5}, {6}
        engine = Engine(party_code_facet)
        ethnicity_facet = TextFacet(column='Ethnicity')
        engine.add_facet(ethnicity_facet)
        self.project.engine = engine
        response = self.project.compute_facets()
        ethnicities = response.facets[1]
        self.assertEqual(ethnicities.choices['B'].count, 1255)
        self.assertEqual(ethnicities.choices['W'].count, 4469)

        # {7}
        ethnicity_facet.include('B')
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 1255)
        row_indexes = [row.index for row in response.rows]
        self.assertEqual(row_indexes, [1, 2, 3, 4, 6, 12, 18, 26, 28, 32])

        # {8}
        response = self.project.compute_facets()
        party_codes = response.facets[0]
        self.assertEqual(party_codes.name, 'Party Code')
        self.assertEqual(party_codes.choices['D'].count, 1179)
        self.assertEqual(party_codes.choices['R'].count, 11)
        self.assertEqual(party_codes.blank_choice.count, 46)

        # {9}
        party_code_facet.include('R')
        response = self.project.compute_facets()
        ethnicities = response.facets[1]
        self.assertEqual(ethnicities.choices['B'].count, 11)

        # {10}
        party_code_facet.reset()
        ethnicity_facet.reset()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 6958)

        # {11}
        office_title_facet = TextFacet('Office Title')
        self.project.engine.add_facet(office_title_facet)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[2].choices), 76)

        # {12} - XXX not sure how to interpret bins & baseBins yet
        office_level_facet = NumericFacet('Office Level')
        self.project.engine.add_facet(office_level_facet)

        # {13}
        office_level_facet.From = 300   # from reserved word
        office_level_facet.to = 320
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 1907)
        response = self.project.compute_facets()
        office_titles = response.facets[2]   # Office Title
        self.assertEqual(len(office_titles.choices), 21)
        # The two keys below differ only by a trailing space.
        self.assertEqual(office_titles.choices['Chief of Police'].count, 2)
        self.assertEqual(office_titles.choices['Chief of Police '].count, 211)

        # {14}
        self.project.engine.remove_all()
        response = self.project.get_rows()
        self.assertEqual(response.filtered, 6958)

        # {15}
        phone_facet = TextFacet('Phone', expression='value[0, 3]')
        self.project.engine.add_facet(phone_facet)
        response = self.project.compute_facets()
        phone_prefixes = response.facets[0]
        self.assertEqual(phone_prefixes.expression, 'value[0, 3]')
        self.assertEqual(phone_prefixes.choices['318'].count, 2331)

        # {16}
        commissioned_date_facet = NumericFacet(
            'Commissioned Date',
            expression='value.toDate().datePart("year")')
        self.project.engine.add_facet(commissioned_date_facet)
        response = self.project.compute_facets()
        commissioned_years = response.facets[1]
        self.assertEqual(commissioned_years.error_count, 959)
        self.assertEqual(commissioned_years.numeric_count, 5999)

        # {17}
        office_description_facet = NumericFacet(
            'Office Description',
            expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
        self.project.engine.add_facet(office_description_facet)
        response = self.project.compute_facets()
        office_numbers = response.facets[2]
        self.assertEqual(office_numbers.min, 0)
        self.assertEqual(office_numbers.max, 110)
        self.assertEqual(office_numbers.numeric_count, 548)
|
2011-04-24 21:59:32 +02:00
|
|
|
|
2011-04-23 06:00:10 +02:00
|
|
|
|
2011-04-25 05:43:45 +02:00
|
|
|
class TutorialTestEditing(RefineTestCase):
    """Tutorial sections "3. Cell Editing" and "4. Row and Column Editing,
    Batched Row Deletion".

    Runs against louisiana-elected-officials.csv. The ``{n}`` markers in the
    comments are the step numbers of the tutorial section being followed.
    """

    project_file = 'louisiana-elected-officials.csv'

    def test_editing(self):
        # Section "3. Cell Editing": {1}
        self.project.engine.remove_all()    # redundant due to setUp

        # {2}
        response = self.project.text_transform(
            column='Zip Code 2', expression='value.toString()[0, 5]')
        description = response['historyEntry']['description']
        self.assertTrue('6067' in description)

        # {3} - XXX history
        # {4}
        office_title_facet = TextFacet('Office Title')
        self.project.engine.add_facet(office_title_facet)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[0].choices), 76)
        response = self.project.text_transform('Office Title', 'value.trim()')
        description = response['historyEntry']['description']
        self.assertTrue('6895' in description)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[0].choices), 67)

        # {5}
        response = self.project.edit(
            'Office Title', 'Councilmen', 'Councilman')
        description = response['historyEntry']['description']
        self.assertTrue('13' in description)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[0].choices), 66)

        # {6}
        response = self.project.compute_clusters('Office Title')
        self.assertFalse(response)

        # {7}
        clusters = self.project.compute_clusters('Office Title', 'knn')
        self.assertEqual(len(clusters), 7)
        first_cluster = clusters[0]
        self.assertEqual(len(first_cluster), 2)
        self.assertEqual(first_cluster[0]['value'], 'RSCC Member')
        self.assertEqual(first_cluster[0]['count'], 233)

        # Not strictly necessary to repeat 'Council Member' but a test
        # of mass_edit, and it's also what the front end sends.
        council_edit = {
            'from': ['Council Member', 'Councilmember'],
            'to': 'Council Member'
        }
        response = self.project.mass_edit('Office Title', [council_edit])
        description = response['historyEntry']['description']
        self.assertTrue('372' in description)
        response = self.project.compute_facets()
        self.assertEqual(len(response.facets[0].choices), 65)

        # Section "4. Row and Column Editing, Batched Row Deletion"
        # Test doesn't strictly follow the tutorial as the "Browse this
        # cluster" performs a text facet which the server can't complete
        # as it busts its max facet count. The useful work is done with
        # get_rows(). Also, we can facet & select in one; the UI can't.
        # {1}, {2}, {3}, {4}
        clusters = self.project.compute_clusters('Candidate Name')
        for cluster in clusters[0:3]:   # just do a few
            for match in cluster:
                # {2}
                # Only the variants that end in ', ' get starred.
                if not match['value'].endswith(', '):
                    continue
                response = self.project.get_rows(
                    TextFacet('Candidate Name', match['value']))
                self.assertEqual(len(response.rows), 1)
                for row in response.rows:
                    star_response = self.project.star_row(row)
                    # The history text is expected to mention the row by
                    # index + 1.
                    self.assertTrue(
                        str(row.index + 1) in
                        star_response['historyEntry']['description'])

        # {5}, {6}, {7}
        response = self.project.compute_facets(StarredFacet(True))
        starred = response.facets[0]
        self.assertEqual(len(starred.choices), 2)    # true & false
        self.assertEqual(starred.choices[True].count, 3)
        response = self.project.remove_rows()
        self.assertTrue('3 rows' in response['historyEntry']['description'])
|
|
|
|
|
2011-04-25 02:19:45 +02:00
|
|
|
|
2011-04-25 08:49:19 +02:00
|
|
|
class TutorialTestDuplicateDetection(RefineTestCase):
    """Tutorial section "4. Row and Column Editing, Duplicate Row Detection
    and Deletion".

    Runs against duplicates.csv. The ``{n}`` markers in the comments are the
    step numbers of that tutorial section.
    """

    project_file = 'duplicates.csv'

    def test_duplicate_detection(self):
        # Section "4. Row and Column Editing,
        #          Duplicate Row Detection and Deletion"
        # {7}, {8}
        response = self.project.get_rows(sort_by='email')
        indexes = [r.index for r in response.rows]
        self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7])

        # {9}
        response = self.project.reorder_rows()
        self.assertEqual('Reorder rows',
                         response['historyEntry']['description'])
        response = self.project.get_rows(sort_by='email')
        indexes = [r.index for r in response.rows]
        # After the reorder the rows come back in index order. Compare with
        # list(range(10)), not range(10): on Python 3 range() is a lazy
        # sequence that never compares equal to a list, so the bare range
        # would always fail there. list(...) is a no-op on Python 2.
        self.assertEqual(indexes, list(range(10)))

        # {10}
        response = self.project.add_column(
            'email', 'count', 'facetCount(value, "value", "email")')
        self.assertTrue('column email by filling 10 rows' in
                        response['historyEntry']['description'])
        response = self.project.get_rows()
        self.assertEqual(self.project.column_order['count'], 1)  # i.e. 2nd
        counts = [r['count'] for r in response.rows]
        self.assertEqual(counts, [2, 2, 1, 1, 3, 3, 3, 1, 2, 2])
|
|
|
|
|
2011-04-25 08:49:19 +02:00
|
|
|
|
2011-04-23 06:00:10 +02:00
|
|
|
# Run every TestCase in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
|