Linkcheck als manuell ausführbaren Task ergänzt #1
This commit is contained in:
parent
fefde0e176
commit
24b142ae5a
|
@ -96,6 +96,12 @@ Harvesting von OAI-PMH-Schnittstellen und Transformation in METS/MODS für das P
|
||||||
task siegen:default
|
task siegen:default
|
||||||
```
|
```
|
||||||
|
|
||||||
|
* Links einer Datenquelle überprüfen
|
||||||
|
|
||||||
|
```
|
||||||
|
task siegen:linkcheck
|
||||||
|
```
|
||||||
|
|
||||||
* Cache einer Datenquelle löschen
|
* Cache einer Datenquelle löschen
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|
|
@ -649,19 +649,6 @@
|
||||||
"repeatCount": 10,
|
"repeatCount": 10,
|
||||||
"description": "Text transform on cells in column doctype using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')"
|
"description": "Text transform on cells in column doctype using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"op": "core/text-transform",
|
|
||||||
"engineConfig": {
|
|
||||||
"facets": [],
|
|
||||||
"mode": "row-based"
|
|
||||||
},
|
|
||||||
"columnName": "linkcheck",
|
|
||||||
"expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')",
|
|
||||||
"onError": "keep-original",
|
|
||||||
"repeat": false,
|
|
||||||
"repeatCount": 10,
|
|
||||||
"description": "Text transform on cells in column linkcheck using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"op": "core/row-removal",
|
"op": "core/row-removal",
|
||||||
"engineConfig": {
|
"engineConfig": {
|
||||||
|
|
|
@ -1,35 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"op": "core/column-addition",
|
|
||||||
"engineConfig": {
|
|
||||||
"facets": [
|
|
||||||
{
|
|
||||||
"type": "list",
|
|
||||||
"name": "ddb:transfer",
|
|
||||||
"expression": "isBlank(value)",
|
|
||||||
"columnName": "ddb:transfer",
|
|
||||||
"invert": false,
|
|
||||||
"omitBlank": false,
|
|
||||||
"omitError": false,
|
|
||||||
"selection": [
|
|
||||||
{
|
|
||||||
"v": {
|
|
||||||
"v": false,
|
|
||||||
"l": "false"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"selectBlank": false,
|
|
||||||
"selectError": false
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"mode": "row-based"
|
|
||||||
},
|
|
||||||
"baseColumnName": "ddb:transfer",
|
|
||||||
"expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)",
|
|
||||||
"onError": "set-to-blank",
|
|
||||||
"newColumnName": "linkcheck",
|
|
||||||
"columnInsertIndex": 34,
|
|
||||||
"description": "Create column linkcheck at index 34 based on column ddb:transfer using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)"
|
|
||||||
}
|
|
||||||
]
|
|
|
@ -1,15 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"op": "core/column-addition",
|
|
||||||
"engineConfig": {
|
|
||||||
"facets": [],
|
|
||||||
"mode": "row-based"
|
|
||||||
},
|
|
||||||
"baseColumnName": "url",
|
|
||||||
"expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)",
|
|
||||||
"onError": "set-to-blank",
|
|
||||||
"newColumnName": "linkcheck",
|
|
||||||
"columnInsertIndex": 3,
|
|
||||||
"description": "Create column linkcheck at index 3 based on column url using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)"
|
|
||||||
}
|
|
||||||
]
|
|
|
@ -58,8 +58,6 @@ tasks:
|
||||||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/dini.json $PROJECT
|
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/dini.json $PROJECT
|
||||||
# Visual Library doctype aus dc:type: Wenn thesis:level == thesis.habilitation dann doctype oaHabil
|
# Visual Library doctype aus dc:type: Wenn thesis:level == thesis.habilitation dann doctype oaHabil
|
||||||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/doctype.json $PROJECT
|
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/doctype.json $PROJECT
|
||||||
#TODO # Links auf Volltexte prüfen: HTTP status code ermitteln (z.B. 200)
|
|
||||||
#TODO - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/linkcheck.json $PROJECT
|
|
||||||
# Datenstruktur für Templating vorbereiten: Pro Zeile ein Datensatz und leere Zeilen löschen
|
# Datenstruktur für Templating vorbereiten: Pro Zeile ein Datensatz und leere Zeilen löschen
|
||||||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/join.json $PROJECT
|
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/join.json $PROJECT
|
||||||
# Export in METS:MODS mit Templating
|
# Export in METS:MODS mit Templating
|
||||||
|
@ -88,7 +86,7 @@ tasks:
|
||||||
# Logdatei von OpenRefine auf Warnungen und Fehlermeldungen prüfen
|
# Logdatei von OpenRefine auf Warnungen und Fehlermeldungen prüfen
|
||||||
- if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Logdatei $PWD/openrefine.log enthält Warnungen!" && exit 1; fi
|
- if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Logdatei $PWD/openrefine.log enthält Warnungen!" && exit 1; fi
|
||||||
# Prüfen, ob Mindestanzahl von 1250 Datensätzen generiert wurde
|
# Prüfen, ob Mindestanzahl von 1250 Datensätzen generiert wurde
|
||||||
- if (( 1250 > $(grep -c recordIdentifier wuppertal.txt) )); then echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/wuppertal.txt!" && exit 1; fi
|
- if (( 1250 > $(grep -c recordIdentifier siegen.txt) )); then echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/siegen.txt!" && exit 1; fi
|
||||||
|
|
||||||
split:
|
split:
|
||||||
dir: data/siegen/split
|
dir: data/siegen/split
|
||||||
|
@ -143,6 +141,21 @@ tasks:
|
||||||
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
||||||
- test -z $(ls -t *.zip | sed -n 2p)
|
- test -z $(ls -t *.zip | sed -n 2p)
|
||||||
|
|
||||||
|
linkcheck:
|
||||||
|
dir: data/siegen
|
||||||
|
desc: links überprüfen
|
||||||
|
cmds:
|
||||||
|
# Links extrahieren
|
||||||
|
- xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
|
||||||
|
# http status code aller Links ermitteln
|
||||||
|
- curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
|
||||||
|
# Logdatei auf status code != 2XX prüfen
|
||||||
|
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
|
||||||
|
sources:
|
||||||
|
- split/*.xml
|
||||||
|
generates:
|
||||||
|
- linkcheck.log
|
||||||
|
|
||||||
delete:
|
delete:
|
||||||
desc: harvesting cache löschen
|
desc: harvesting cache löschen
|
||||||
cmds:
|
cmds:
|
||||||
|
|
|
@ -145,6 +145,21 @@ tasks:
|
||||||
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
||||||
- test -z $(ls -t *.zip | sed -n 2p)
|
- test -z $(ls -t *.zip | sed -n 2p)
|
||||||
|
|
||||||
|
linkcheck:
|
||||||
|
dir: data/wuppertal
|
||||||
|
desc: links überprüfen
|
||||||
|
cmds:
|
||||||
|
# Links extrahieren
|
||||||
|
- xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
|
||||||
|
# http status code aller Links ermitteln
|
||||||
|
- curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
|
||||||
|
# Logdatei auf status code != 2XX prüfen
|
||||||
|
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
|
||||||
|
sources:
|
||||||
|
- split/*.xml
|
||||||
|
generates:
|
||||||
|
- linkcheck.log
|
||||||
|
|
||||||
delete:
|
delete:
|
||||||
desc: harvesting cache löschen
|
desc: harvesting cache löschen
|
||||||
cmds:
|
cmds:
|
||||||
|
|
Loading…
Reference in New Issue