diff --git a/kapitel-3/35-verarbeitung-von-marc21-mit-openrefine.md b/kapitel-3/35-verarbeitung-von-marc21-mit-openrefine.md index c677430..e8764ee 100644 --- a/kapitel-3/35-verarbeitung-von-marc21-mit-openrefine.md +++ b/kapitel-3/35-verarbeitung-von-marc21-mit-openrefine.md @@ -115,7 +115,7 @@ Wenn Sie sich auf Basis der Empfehlung der LoC, der Statistik und Stichproben f 2. Ausgewählte Daten aus Spalte `content` mit der Funktion `add column based on column...` in eine neue "Dublin Core"-Spalte kopieren \(Name der Spalte ist das Dublin-Core-Feld\). 3. Bei Bedarf die Daten in der neuen Spalte mit Transformationen bearbeiten, um z.B. Trennzeichen einzufügen. 4. Zusammengehörige Werte \(z.B. Person und ihre Lebensdaten\) in der neuen Spalte mit der Funktion `join multi-valued cells` zusammenführen. Damit nicht zuviel \(z.B. mehrere Personen\) zusammengeführt werden, muss dabei die Spalte `index` vorne stehen. -5. Abschließend dann noch einmal mit der Funktion `join multi-valued cells` und dem bekannten Trennzeichen `␟` die Daten in einer Zeile pro Datensatz zusammenführen. Hierzu muss dann die Spalte `id` vorne stehen. +5. Abschließend dann noch einmal mit der Funktion `join multi-valued cells` und dem bekannten Trennzeichen `␟` die Daten in einer Zeile pro Datensatz zusammenführen. Hierzu muss dann die Spalte `id` vorne stehen. Um die Performance zu verbessern, kann alternativ auch die Transformation `row.record.cells["Name der Spalte"].value.join("␟")` (zusammen mit einer Facette "by blank" mit Wert `false` auf die Spalte `id`) auf die neuen Spalten angewendet werden. Beispiel für "Autor/in" \(MARC21 `100a,D,d,e` auf Dublin Core `dc:creator`\): @@ -144,7 +144,8 @@ Beispiel für "Autor/in" \(MARC21 `100a,D,d,e` auf Dublin Core `dc:creator`\): * Spalte `creator` / Edit cells / Join multi-valued cells... / Separator: ` ` \(Leerzeichen\) 5. 
Abschließend die Daten in einer Zeile pro Datensatz zusammenführen * Spalte `id` / Edit column / Move column to beginning - * Spalte `creator` / Edit cells / Join multi-valued cells... / Separator: `␟` \(Unit Separator\) + * Spalte `id` / Facet / Customized facets / Facet by blank... / Wert `false` auswählen + * Spalte `creator` / Edit cells / Transform... / Expression: `row.record.cells["creator"].value.join("␟")` 6. Ergebnis prüfen und ggf. nachbessern * Spalte `creator` / Facet / Text facet * Spalte `creator` / Edit cells / Cluster and edit... / Method: nearest neighbor diff --git a/losungen.md b/losungen.md index 4043894..51de9d4 100644 --- a/losungen.md +++ b/losungen.md @@ -24,7 +24,7 @@ curl "http://oai.swissbib.ch/oai/DB=2.1?verb=ListRecords&metadataPrefix=m21-xml% JSON-Datei mit Transformationsregeln für ein Mapping von MARC21 auf Dublin Core: [openrefine-marc2dc.json](https://raw.githubusercontent.com/felixlohmeier/kurs-bibliotheks-und-archivinformatik/master/openrefine/openrefine-marc2dc.json) -Ergebnis als TSV-Datei: [openrefine/einstein-nebis\_2017-11-02.tsv](https://github.com/felixlohmeier/kurs-bibliotheks-und-archivinformatik/raw/master/openrefine/einstein-nebis_2017-11-02.tsv) +Ergebnis als TSV-Datei: [openrefine/einstein-nebis\_2017-11-02.tsv](https://github.com/felixlohmeier/kurs-bibliotheks-und-archivinformatik/raw/master/openrefine/einstein-nebis_2017-11-02.tsv) (speichern Sie die Datei zur Verwendung in Kapitel 4 als `einstein.tsv` im Ordner `Downloads`) Folgende Mappings wurden darin exemplarisch umgesetzt: diff --git a/openrefine/openrefine-marc2dc.json b/openrefine/openrefine-marc2dc.json index 6ac1c34..1d85842 100644 --- a/openrefine/openrefine-marc2dc.json +++ b/openrefine/openrefine-marc2dc.json @@ -245,11 +245,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column creator", + "op": "core/text-transform", + "description": "Text transform on cells in column creator using expression 
grel:row.record.cells[\"creator\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "creator", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"creator\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -381,11 +407,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column title", + "op": "core/text-transform", + "description": "Text transform on cells in column title using expression grel:row.record.cells[\"title\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "title", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"title\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -645,11 +697,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column contributor", + "op": "core/text-transform", + "description": "Text transform on cells in column contributor using expression grel:row.record.cells[\"contributor\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ 
+ { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "contributor", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"contributor\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -704,11 +782,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column language", + "op": "core/text-transform", + "description": "Text transform on cells in column language using expression grel:row.record.cells[\"language\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "language", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"language\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -840,11 +944,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column publisher", + "op": "core/text-transform", + "description": "Text transform on cells in column publisher using expression grel:row.record.cells[\"publisher\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, 
"columnName": "publisher", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"publisher\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -899,11 +1029,37 @@ "onError": "set-to-blank" }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column coverage", + "op": "core/text-transform", + "description": "Text transform on cells in column coverage using expression grel:row.record.cells[\"coverage\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "coverage", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"coverage\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -958,11 +1114,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column date", + "op": "core/text-transform", + "description": "Text transform on cells in column date using expression grel:row.record.cells[\"date\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "date", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"date\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + 
"repeatCount": 10 }, { "op": "core/column-addition", @@ -1221,11 +1403,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column identifier", + "op": "core/text-transform", + "description": "Text transform on cells in column identifier using expression grel:row.record.cells[\"identifier\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "identifier", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"identifier\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -1280,11 +1488,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column rights", + "op": "core/text-transform", + "description": "Text transform on cells in column rights using expression grel:row.record.cells[\"rights\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "rights", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"rights\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -1358,11 +1592,37 @@ "index": 0 }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column type", + "op": 
"core/text-transform", + "description": "Text transform on cells in column type using expression grel:row.record.cells[\"type\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "type", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"type\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/text-transform", @@ -1391,7 +1651,7 @@ } ] }, - "columnName": "type", + "columnName": "type", "expression": "grel:value.split(\"␟\").uniques().join(\"␟\")", "onError": "keep-original", "repeat": false, @@ -1545,11 +1805,37 @@ "onError": "set-to-blank" }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column description", + "op": "core/text-transform", + "description": "Text transform on cells in column description using expression grel:row.record.cells[\"description\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "description", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"description\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition", @@ -1604,11 +1890,37 @@ "onError": "set-to-blank" }, { - "op": "core/multivalued-cell-join", - "description": "Join multi-valued cells in column 
extent", + "op": "core/text-transform", + "description": "Text transform on cells in column extent using expression grel:row.record.cells[\"extent\"].value.join(\"␟\")", + "engineConfig": { + "mode": "row-based", + "facets": [ + { + "omitError": false, + "expression": "isBlank(value)", + "selectBlank": false, + "selection": [ + { + "v": { + "v": false, + "l": "false" + } + } + ], + "selectError": false, + "invert": false, + "name": "id", + "omitBlank": false, + "type": "list", + "columnName": "id" + } + ] + }, "columnName": "extent", - "keyColumnName": "id", - "separator": "␟" + "expression": "grel:row.record.cells[\"extent\"].value.join(\"␟\")", + "onError": "keep-original", + "repeat": false, + "repeatCount": 10 }, { "op": "core/column-addition",