This fork extends the command line interface (CLI) and is distributed as a convenient one-file-executable (Windows, Linux, Mac). It is also available via Docker Hub, PyPI and Binder.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

491 lines
21 KiB

  1. #!/usr/bin/env python
  2. """
  3. test_tutorial.py
  4. The tests here are based on David Huynh's Refine tutorial at
  5. http://davidhuynh.net/spaces/nicar2011/tutorial.pdf The tests perform all the
  6. Refine actions given in the tutorial (except the web scraping) and verify the
  7. changes expected to be observed explained in the tutorial.
  8. These tests require a connection to a Refine server either at
  9. http://127.0.0.1:3333/ or by specifying environment variables
  10. OPENREFINE_HOST and OPENREFINE_PORT.
  11. """
  12. # Copyright (c) 2011 Paul Makepeace, Real Programmers. All rights reserved.
  13. import unittest
  14. from google.refine import facet
  15. from tests import refinetest
  16. class TutorialTestFacets(refinetest.RefineTestCase):
  17. project_file = 'louisiana-elected-officials.csv'
  18. project_options = {'guess_cell_value_types': True}
  19. def test_get_rows(self):
  20. # Section "2. Exploration using Facets": {3}
  21. response = self.project.get_rows(limit=10)
  22. self.assertEqual(len(response.rows), 10)
  23. self.assertEqual(response.limit, 10)
  24. self.assertEqual(response.total, 6958)
  25. self.assertEqual(response.filtered, 6958)
  26. for row in response.rows:
  27. self.assertFalse(row.flagged)
  28. self.assertFalse(row.starred)
  29. def test_facet(self):
  30. # Section "2. Exploration using Facets": {4}
  31. party_code_facet = facet.TextFacet(column='Party Code')
  32. response = self.project.compute_facets(party_code_facet)
  33. pc = response.facets[0]
  34. # test look by index same as look up by facet object
  35. self.assertEqual(pc, response.facets[party_code_facet])
  36. self.assertEqual(pc.name, 'Party Code')
  37. self.assertEqual(pc.choices['D'].count, 3700)
  38. self.assertEqual(pc.choices['N'].count, 15)
  39. self.assertEqual(pc.blank_choice.count, 1446)
  40. # {5}, {6}
  41. engine = facet.Engine(party_code_facet)
  42. ethnicity_facet = facet.TextFacet(column='Ethnicity')
  43. engine.add_facet(ethnicity_facet)
  44. self.project.engine = engine
  45. response = self.project.compute_facets()
  46. e = response.facets[ethnicity_facet]
  47. self.assertEqual(e.choices['B'].count, 1255)
  48. self.assertEqual(e.choices['W'].count, 4469)
  49. # {7}
  50. ethnicity_facet.include('B')
  51. response = self.project.get_rows()
  52. self.assertEqual(response.filtered, 1255)
  53. indexes = [row.index for row in response.rows]
  54. self.assertEqual(indexes, [1, 2, 3, 4, 6, 12, 18, 26, 28, 32])
  55. # {8}
  56. response = self.project.compute_facets()
  57. pc = response.facets[party_code_facet]
  58. self.assertEqual(pc.name, 'Party Code')
  59. self.assertEqual(pc.choices['D'].count, 1179)
  60. self.assertEqual(pc.choices['R'].count, 11)
  61. self.assertEqual(pc.blank_choice.count, 46)
  62. # {9}
  63. party_code_facet.include('R')
  64. response = self.project.compute_facets()
  65. e = response.facets[ethnicity_facet]
  66. self.assertEqual(e.choices['B'].count, 11)
  67. # {10}
  68. party_code_facet.reset()
  69. ethnicity_facet.reset()
  70. response = self.project.get_rows()
  71. self.assertEqual(response.filtered, 6958)
  72. # {11}
  73. office_title_facet = facet.TextFacet('Office Title')
  74. self.project.engine.add_facet(office_title_facet)
  75. response = self.project.compute_facets()
  76. self.assertEqual(len(response.facets[2].choices), 76)
  77. # {12} - XXX not sure how to interpret bins & baseBins yet
  78. office_level_facet = facet.NumericFacet('Office Level')
  79. self.project.engine.add_facet(office_level_facet)
  80. # {13}
  81. office_level_facet.From = 300 # from reserved word
  82. office_level_facet.to = 320
  83. response = self.project.get_rows()
  84. self.assertEqual(response.filtered, 1907)
  85. response = self.project.compute_facets()
  86. ot = response.facets[office_title_facet]
  87. self.assertEqual(len(ot.choices), 21)
  88. self.assertEqual(ot.choices['Chief of Police'].count, 2)
  89. self.assertEqual(ot.choices['Chief of Police '].count, 211)
  90. # {14}
  91. self.project.engine.remove_all()
  92. response = self.project.get_rows()
  93. self.assertEqual(response.filtered, 6958)
  94. # {15}
  95. phone_facet = facet.TextFacet('Phone', expression='value[0, 3]')
  96. self.project.engine.add_facet(phone_facet)
  97. response = self.project.compute_facets()
  98. p = response.facets[phone_facet]
  99. self.assertEqual(p.expression, 'value[0, 3]')
  100. self.assertEqual(p.choices['318'].count, 2331)
  101. # {16}
  102. commissioned_date_facet = facet.NumericFacet(
  103. 'Commissioned Date',
  104. expression='value.toDate().datePart("year")')
  105. self.project.engine.add_facet(commissioned_date_facet)
  106. response = self.project.compute_facets()
  107. cd = response.facets[commissioned_date_facet]
  108. self.assertEqual(cd.error_count, 959)
  109. self.assertEqual(cd.numeric_count, 5999)
  110. # {17}
  111. office_description_facet = facet.NumericFacet(
  112. 'Office Description',
  113. expression=r'value.match(/\D*(\d+)\w\w Rep.*/)[0].toNumber()')
  114. self.project.engine.add_facet(office_description_facet)
  115. response = self.project.compute_facets()
  116. od = response.facets[office_description_facet]
  117. self.assertEqual(od.min, 0)
  118. self.assertEqual(od.max, 110)
  119. self.assertEqual(od.numeric_count, 548)
  120. class TutorialTestEditing(refinetest.RefineTestCase):
  121. project_file = 'louisiana-elected-officials.csv'
  122. project_options = {'guess_cell_value_types': True}
  123. def test_editing(self):
  124. # Section "3. Cell Editing": {1}
  125. self.project.engine.remove_all() # redundant due to setUp
  126. # {2}
  127. self.project.text_transform(column='Zip Code 2',
  128. expression='value.toString()[0, 5]')
  129. self.assertInResponse('transform on 6067 cells in column Zip Code 2')
  130. # {3} - XXX history
  131. # {4}
  132. office_title_facet = facet.TextFacet('Office Title')
  133. self.project.engine.add_facet(office_title_facet)
  134. response = self.project.compute_facets()
  135. self.assertEqual(len(response.facets[office_title_facet].choices), 76)
  136. self.project.text_transform('Office Title', 'value.trim()')
  137. self.assertInResponse('6895')
  138. response = self.project.compute_facets()
  139. self.assertEqual(len(response.facets[office_title_facet].choices), 67)
  140. # {5}
  141. self.project.edit('Office Title', 'Councilmen', 'Councilman')
  142. self.assertInResponse('13')
  143. response = self.project.compute_facets()
  144. self.assertEqual(len(response.facets[office_title_facet].choices), 66)
  145. # {6}
  146. response = self.project.compute_clusters('Office Title')
  147. self.assertTrue(not response)
  148. # {7}
  149. clusters = self.project.compute_clusters('Office Title', 'knn')
  150. self.assertEqual(len(clusters), 7)
  151. first_cluster = clusters[0]
  152. self.assertEqual(len(first_cluster), 2)
  153. self.assertEqual(first_cluster[0]['value'], 'RSCC Member')
  154. self.assertEqual(first_cluster[0]['count'], 233)
  155. # Not strictly necessary to repeat 'Council Member' but a test
  156. # of mass_edit, and it's also what the front end sends.
  157. self.project.mass_edit('Office Title', [{
  158. 'from': ['Council Member', 'Councilmember'],
  159. 'to': 'Council Member'
  160. }])
  161. self.assertInResponse('372')
  162. response = self.project.compute_facets()
  163. self.assertEqual(len(response.facets[office_title_facet].choices), 65)
  164. # Section "4. Row and Column Editing, Batched Row Deletion"
  165. # Test doesn't strictly follow the tutorial as the "Browse this
  166. # cluster" performs a text facet which the server can't complete
  167. # as it busts its max facet count. The useful work is done with
  168. # get_rows(). Also, we can facet & select in one; the UI can't.
  169. # {1}, {2}, {3}, {4}
  170. clusters = self.project.compute_clusters('Candidate Name')
  171. for cluster in clusters[0:3]: # just do a few
  172. for match in cluster:
  173. # {2}
  174. if match['value'].endswith(', '):
  175. response = self.project.get_rows(
  176. facet.TextFacet('Candidate Name', match['value']))
  177. self.assertEqual(len(response.rows), 1)
  178. for row in response.rows:
  179. self.project.star_row(row)
  180. self.assertInResponse(str(row.index + 1))
  181. # {5}, {6}, {7}
  182. response = self.project.compute_facets(facet.StarredFacet(True))
  183. self.assertEqual(len(response.facets[0].choices), 2) # true & false
  184. self.assertEqual(response.facets[0].choices[True].count, 3)
  185. self.project.remove_rows()
  186. self.assertInResponse('3 rows')
  187. class TutorialTestDuplicateDetection(refinetest.RefineTestCase):
  188. project_file = 'duplicates.csv'
  189. def test_duplicate_detection(self):
  190. # Section "4. Row and Column Editing,
  191. # Duplicate Row Detection and Deletion"
  192. # {7}, {8}
  193. response = self.project.get_rows(sort_by='email')
  194. indexes = [row.index for row in response.rows]
  195. self.assertEqual(indexes, [4, 9, 8, 3, 0, 2, 5, 6, 1, 7])
  196. # {9}
  197. self.project.reorder_rows()
  198. self.assertInResponse('Reorder rows')
  199. response = self.project.get_rows()
  200. indexes = [row.index for row in response.rows]
  201. self.assertEqual(indexes, range(10))
  202. # {10}
  203. self.project.add_column(
  204. 'email', 'count', 'facetCount(value, "value", "email")')
  205. self.assertInResponse('column email by filling 10 rows')
  206. response = self.project.get_rows()
  207. self.assertEqual(self.project.column_order['email'], 0) # i.e. 1st
  208. self.assertEqual(self.project.column_order['count'], 1) # i.e. 2nd
  209. counts = [row['count'] for row in response.rows]
  210. self.assertEqual(counts, [2, 2, 1, 1, 3, 3, 3, 1, 2, 2])
  211. # {11}
  212. self.assertFalse(self.project.has_records)
  213. self.project.blank_down('email')
  214. self.assertInResponse('Blank down 4 cells')
  215. self.assertTrue(self.project.has_records)
  216. response = self.project.get_rows()
  217. emails = [1 if row['email'] else 0 for row in response.rows]
  218. self.assertEqual(emails, [1, 0, 1, 1, 1, 0, 0, 1, 1, 0])
  219. # {12}
  220. blank_facet = facet.BlankFacet('email', selection=True)
  221. # {13}
  222. self.project.remove_rows(blank_facet)
  223. self.assertInResponse('Remove 4 rows')
  224. self.project.engine.remove_all()
  225. response = self.project.get_rows()
  226. email_counts = [(row['email'], row['count']) for row in response.rows]
  227. self.assertEqual(email_counts, [
  228. (u'arthur.duff@example4.com', 2),
  229. (u'ben.morisson@example6.org', 1),
  230. (u'ben.tyler@example3.org', 1),
  231. (u'danny.baron@example1.com', 3),
  232. (u'jean.griffith@example5.org', 1),
  233. (u'melanie.white@example2.edu', 2)
  234. ])
  235. class TutorialTestTransposeColumnsIntoRows(refinetest.RefineTestCase):
  236. project_file = 'us_economic_assistance.csv'
  237. def test_transpose_columns_into_rows(self):
  238. # Section "5. Structural Editing, Transpose Columns into Rows"
  239. # {1}, {2}, {3}
  240. self.project.transpose_columns_into_rows('FY1946', 64, 'pair')
  241. self.assertInResponse('64 column(s) starting with FY1946')
  242. # {4}
  243. self.project.add_column('pair', 'year', 'value[2,6].toNumber()')
  244. self.assertInResponse('filling 26185 rows')
  245. # {5}
  246. self.project.text_transform(
  247. column='pair', expression='value.substring(7).toNumber()')
  248. self.assertInResponse('transform on 26185 cells')
  249. # {6}
  250. self.project.rename_column('pair', 'amount')
  251. self.assertInResponse('Rename column pair to amount')
  252. # {7}
  253. self.project.fill_down('country_name')
  254. self.assertInResponse('Fill down 23805 cells')
  255. self.project.fill_down('program_name')
  256. self.assertInResponse('Fill down 23805 cells')
  257. # spot check of last row for transforms and fill down
  258. response = self.project.get_rows()
  259. row10 = response.rows[9]
  260. self.assertEqual(row10['country_name'], 'Afghanistan')
  261. self.assertEqual(row10['program_name'],
  262. 'Department of Defense Security Assistance')
  263. self.assertEqual(row10['amount'], 113777303)
  264. class TutorialTestTransposeFixedNumberOfRowsIntoColumns(
  265. refinetest.RefineTestCase):
  266. project_file = 'fixed-rows.csv'
  267. project_format = 'text/line-based'
  268. project_options = {'header_lines': 0}
  269. def test_transpose_fixed_number_of_rows_into_columns(self):
  270. if self.server.version not in ('2.0', '2.1'):
  271. self.project.rename_column('Column 1', 'Column')
  272. # Section "5. Structural Editing,
  273. # Transpose Fixed Number of Rows into Columns"
  274. # {1}
  275. self.assertTrue('Column' in self.project.column_order)
  276. # {8}
  277. self.project.transpose_rows_into_columns('Column', 4)
  278. self.assertInResponse('Transpose every 4 cells in column Column')
  279. # {9} - renaming column triggers a bug in Refine <= 2.1
  280. if self.server.version not in ('2.0', '2.1'):
  281. self.project.rename_column('Column 2', 'Address')
  282. self.project.rename_column('Column 3', 'Address 2')
  283. self.project.rename_column('Column 4', 'Status')
  284. # {10}
  285. self.project.add_column(
  286. 'Column 1', 'Transaction',
  287. 'if(value.contains(" sent "), "send", "receive")')
  288. self.assertInResponse('Column 1 by filling 4 rows')
  289. # {11}
  290. transaction_facet = facet.TextFacet(column='Transaction',
  291. selection='send')
  292. self.project.engine.add_facet(transaction_facet)
  293. self.project.compute_facets()
  294. # {12}, {13}, {14}
  295. self.project.add_column(
  296. 'Column 1', 'Sender',
  297. 'value.partition(" sent ")[0]')
  298. # XXX resetting the facet shows data in rows with Transaction=receive
  299. # which shouldn't have been possible with the facet.
  300. self.project.add_column(
  301. 'Column 1', 'Recipient',
  302. 'value.partition(" to ")[2].partition(" on ")[0]')
  303. self.project.add_column(
  304. 'Column 1', 'Amount',
  305. 'value.partition(" sent ")[2].partition(" to ")[0]')
  306. # {15}
  307. transaction_facet.reset().include('receive')
  308. self.project.get_rows()
  309. # XXX there seems to be some kind of bug where the model doesn't
  310. # match get_rows() output - cellIndex being returned that are
  311. # out of range.
  312. #self.assertTrue(a_row['Sender'] is None)
  313. #self.assertTrue(a_row['Recipient'] is None)
  314. #self.assertTrue(a_row['Amount'] is None)
  315. # {16}
  316. for column, expression in (
  317. ('Sender',
  318. 'cells["Column 1"].value.partition(" from ")[2].partition(" on ")[0]'),
  319. ('Recipient',
  320. 'cells["Column 1"].value.partition(" received ")[0]'),
  321. ('Amount',
  322. 'cells["Column 1"].value.partition(" received ")[2].partition(" from ")[0]')
  323. ):
  324. self.project.text_transform(column, expression)
  325. self.assertInResponse('2 cells')
  326. # {17}
  327. transaction_facet.reset()
  328. # {18}
  329. self.project.text_transform('Column 1', 'value.partition(" on ")[2]')
  330. self.assertInResponse('4 cells')
  331. # {19}
  332. self.project.reorder_columns(['Transaction', 'Amount', 'Sender',
  333. 'Recipient'])
  334. self.assertInResponse('Reorder columns')
  335. class TutorialTestTransposeVariableNumberOfRowsIntoColumns(
  336. refinetest.RefineTestCase):
  337. project_file = 'variable-rows.csv'
  338. project_format = 'text/line-based'
  339. project_options = {'header_lines': 0}
  340. def test_transpose_variable_number_of_rows_into_columns(self):
  341. # {20}, {21}
  342. if self.server.version not in ('2.0', '2.1') :
  343. self.project.rename_column('Column 1', 'Column')
  344. self.project.add_column(
  345. 'Column', 'First Line', 'if(value.contains(" on "), value, null)')
  346. self.assertInResponse('Column by filling 4 rows')
  347. response = self.project.get_rows()
  348. first_names = [row['First Line'][0:10] if row['First Line'] else None
  349. for row in response.rows]
  350. self.assertEqual(first_names, [
  351. 'Tom Dalton', None, None, None,
  352. 'Morgan Law', None, None, None, None, 'Eric Batem'])
  353. # {22}
  354. self.project.move_column('First Line', 0)
  355. self.assertInResponse('Move column First Line to position 0')
  356. self.assertEqual(self.project.column_order['First Line'], 0)
  357. # {23}
  358. self.project.engine.mode = 'record-based'
  359. response = self.project.get_rows()
  360. self.assertEqual(response.mode, 'record-based')
  361. self.assertEqual(response.filtered, 4)
  362. # {24}
  363. self.project.add_column(
  364. 'Column', 'Status', 'row.record.cells["Column"].value[-1]')
  365. self.assertInResponse('filling 18 rows')
  366. # {25}
  367. self.project.text_transform(
  368. 'Column', 'row.record.cells["Column"].value[1, -1].join("|")')
  369. self.assertInResponse('18 cells')
  370. # {26}
  371. self.project.engine.mode = 'row-based'
  372. # {27}
  373. blank_facet = facet.BlankFacet('First Line', selection=True)
  374. self.project.remove_rows(blank_facet)
  375. self.assertInResponse('Remove 14 rows')
  376. self.project.engine.remove_all()
  377. # {28}
  378. self.project.split_column('Column', separator='|')
  379. self.assertInResponse('Split 4 cell(s) in column Column')
  380. class TutorialTestWebScraping(refinetest.RefineTestCase):
  381. project_file = 'eli-lilly.csv'
  382. filter_expr_1 = """
  383. forEach(
  384. value[2,-2].replace("&#160;", " ").split("), ("),
  385. v,
  386. v[0,-1].partition(", '", true).join(":")
  387. ).join("|")
  388. """
  389. filter_expr_2 = """
  390. filter(
  391. value.split("|"), p, p.partition(":")[0].toNumber() == %d
  392. )[0].partition(":")[2]
  393. """
  394. def test_web_scraping(self):
  395. # Section "6. Web Scraping"
  396. # {1}, {2}
  397. self.project.split_column('key', separator=':')
  398. self.assertInResponse('Split 5409 cell(s) in column key')
  399. self.project.rename_column('key 1', 'page')
  400. self.assertInResponse('Rename column key 1 to page')
  401. self.project.rename_column('key 2', 'top')
  402. self.assertInResponse('Rename column key 2 to top')
  403. self.project.move_column('line', 'end')
  404. self.assertInResponse('Move column line to position 2')
  405. # {3}
  406. self.project.sorting = facet.Sorting([
  407. {'column': 'page', 'valueType': 'number'},
  408. {'column': 'top', 'valueType': 'number'},
  409. ])
  410. self.project.reorder_rows()
  411. self.assertInResponse('Reorder rows')
  412. first_row = self.project.get_rows(limit=1).rows[0]
  413. self.assertEqual(first_row['page'], 1)
  414. self.assertEqual(first_row['top'], 24)
  415. # {4}
  416. filter_facet = facet.TextFilterFacet('line', 'ahman')
  417. rows = self.project.get_rows(filter_facet).rows
  418. self.assertEqual(len(rows), 1)
  419. self.assertEqual(rows[0]['top'], 106)
  420. filter_facet.query = 'alvarez'
  421. rows = self.project.get_rows().rows
  422. self.assertEqual(len(rows), 2)
  423. self.assertEqual(rows[-1]['top'], 567)
  424. self.project.engine.remove_all()
  425. # {5} - tutorial says 'line'; it means 'top'
  426. line_facet = facet.NumericFacet('top')
  427. line_facet.to = 100
  428. self.project.remove_rows(line_facet)
  429. self.assertInResponse('Remove 775 rows')
  430. line_facet.From = 570
  431. line_facet.to = 600
  432. self.project.remove_rows(line_facet)
  433. self.assertInResponse('Remove 71 rows')
  434. line_facet.reset()
  435. response = self.project.get_rows()
  436. self.assertEqual(response.filtered, 4563)
  437. # {6}
  438. page_facet = facet.TextFacet('page', 1) # 1 not '1'
  439. self.project.engine.add_facet(page_facet)
  440. # {7}
  441. rows = self.project.get_rows().rows
  442. # Look for a row with a name in it by skipping HTML
  443. name_row = [row for row in rows if '<b>' not in row['line']][0]
  444. self.assertTrue('WELLNESS' in name_row['line'])
  445. self.assertEqual(name_row['top'], 161)
  446. line_facet.From = 20
  447. line_facet.to = 160
  448. self.project.remove_rows()
  449. self.assertInResponse('Remove 9 rows')
  450. self.project.engine.remove_all()
  451. # {8}
  452. self.project.text_transform('line', expression=self.filter_expr_1)
  453. self.assertInResponse('Text transform on 4554 cells in column line')
  454. # {9} - XXX following is generating Java exceptions
  455. #filter_expr = self.filter_expr_2 % 16
  456. #self.project.add_column('line', 'Name', expression=filter_expr)
  457. # {10} to the final {19} - nothing new in terms of exercising the API.
# Run the test cases in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()