Linkcheck als manuell ausführbaren Task ergänzt #1
This commit is contained in:
parent
fefde0e176
commit
24b142ae5a
|
@ -96,6 +96,12 @@ Harvesting von OAI-PMH-Schnittstellen und Transformation in METS/MODS für das P
|
|||
task siegen:default
|
||||
```
|
||||
|
||||
* Links einer Datenquelle überprüfen
|
||||
|
||||
```
|
||||
task siegen:linkcheck
|
||||
```
|
||||
|
||||
* Cache einer Datenquelle löschen
|
||||
|
||||
```
|
||||
|
|
|
@ -649,19 +649,6 @@
|
|||
"repeatCount": 10,
|
||||
"description": "Text transform on cells in column doctype using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')"
|
||||
},
|
||||
{
|
||||
"op": "core/text-transform",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"columnName": "linkcheck",
|
||||
"expression": "grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')",
|
||||
"onError": "keep-original",
|
||||
"repeat": false,
|
||||
"repeatCount": 10,
|
||||
"description": "Text transform on cells in column linkcheck using expression grel:if(isNonBlank(cells['id'].value),row.record.cells[columnName].value.join('␞'),'')"
|
||||
},
|
||||
{
|
||||
"op": "core/row-removal",
|
||||
"engineConfig": {
|
||||
|
|
|
@ -1,35 +0,0 @@
|
|||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [
|
||||
{
|
||||
"type": "list",
|
||||
"name": "ddb:transfer",
|
||||
"expression": "isBlank(value)",
|
||||
"columnName": "ddb:transfer",
|
||||
"invert": false,
|
||||
"omitBlank": false,
|
||||
"omitError": false,
|
||||
"selection": [
|
||||
{
|
||||
"v": {
|
||||
"v": false,
|
||||
"l": "false"
|
||||
}
|
||||
}
|
||||
],
|
||||
"selectBlank": false,
|
||||
"selectError": false
|
||||
}
|
||||
],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "ddb:transfer",
|
||||
"expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "linkcheck",
|
||||
"columnInsertIndex": 34,
|
||||
"description": "Create column linkcheck at index 34 based on column ddb:transfer using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)"
|
||||
}
|
||||
]
|
|
@ -1,15 +0,0 @@
|
|||
[
|
||||
{
|
||||
"op": "core/column-addition",
|
||||
"engineConfig": {
|
||||
"facets": [],
|
||||
"mode": "row-based"
|
||||
},
|
||||
"baseColumnName": "url",
|
||||
"expression": "jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)",
|
||||
"onError": "set-to-blank",
|
||||
"newColumnName": "linkcheck",
|
||||
"columnInsertIndex": 3,
|
||||
"description": "Create column linkcheck at index 3 based on column url using expression jython:import httplib\nimport urlparse\nstatus = []\nfor x in value.split(u'\\u241e'):\n url = urlparse.urlparse(x)\n conn = httplib.HTTPConnection(url[1])\n conn.request(\"HEAD\", url[2])\n res = conn.getresponse()\n status.append(str(res.status))\nreturn ','.join(status)"
|
||||
}
|
||||
]
|
|
@ -58,8 +58,6 @@ tasks:
|
|||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/dini.json $PROJECT
|
||||
# Visual Library doctype aus dc:type: Wenn thesis:level == thesis.habilitation dann doctype oaHabil
|
||||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/doctype.json $PROJECT
|
||||
#TODO # Links auf Volltexte prüfen: HTTP status code ermitteln (z.B. 200)
|
||||
#TODO - $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/linkcheck.json $PROJECT
|
||||
# Datenstruktur für Templating vorbereiten: Pro Zeile ein Datensatz und leere Zeilen löschen
|
||||
- $OPENREFINE_CLIENT -P $PORT --apply ../../../rules/siegen/join.json $PROJECT
|
||||
# Export in METS:MODS mit Templating
|
||||
|
@ -88,7 +86,7 @@ tasks:
|
|||
# Logdatei von OpenRefine auf Warnungen und Fehlermeldungen prüfen
|
||||
- if grep -i 'exception\|error' openrefine.log; then echo 1>&2 "Logdatei $PWD/openrefine.log enthält Warnungen!" && exit 1; fi
|
||||
# Prüfen, ob Mindestanzahl von 1250 Datensätzen generiert wurde
|
||||
- if (( 1250 > $(grep -c recordIdentifier wuppertal.txt) )); then echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/wuppertal.txt!" && exit 1; fi
|
||||
# Fail when fewer than 1250 records were generated; the message names the file that was actually counted (siegen.txt).
- if (( 1250 > $(grep -c recordIdentifier siegen.txt) )); then echo 1>&2 "Unerwartet geringe Anzahl an Datensätzen in $PWD/siegen.txt!" && exit 1; fi
|
||||
|
||||
split:
|
||||
dir: data/siegen/split
|
||||
|
@ -143,6 +141,21 @@ tasks:
|
|||
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
||||
- test -z $(ls -t *.zip | sed -n 2p)
|
||||
|
||||
# Task "linkcheck": checks the links contained in the split XML files of this data source.
# All commands below run in the directory given by `dir`.
linkcheck:
|
||||
dir: data/siegen
|
||||
desc: links überprüfen
|
||||
cmds:
|
||||
# Extract links: select every attribute whose local name is "href" from split/*.xml,
# keep the second field when splitting each output line at double quotes (the attribute value),
# and write the result to links.txt.
|
||||
- xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
|
||||
# Determine the HTTP status code of every link: the inner loop emits "-o /dev/null <link>"
# for each line of links.txt, so a single curl call issues a HEAD request per link and
# writes one "<status code> <effective URL>" line per request to linkcheck.log.
# NOTE(review): the command substitution is unquoted, so all links end up as words on one curl command line.
|
||||
- curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
|
||||
# Check the log for status codes other than 2xx: grep prints every line that does not start with "2"
# (this includes 3xx, 4xx and 5xx); if any such line exists, a message goes to stderr and the command exits with 1.
|
||||
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
|
||||
# Declared input and output files of this task — presumably used by Task for its up-to-date check; TODO confirm.
sources:
|
||||
- split/*.xml
|
||||
generates:
|
||||
- linkcheck.log
|
||||
|
||||
delete:
|
||||
desc: harvesting cache löschen
|
||||
cmds:
|
||||
|
|
|
@ -145,6 +145,21 @@ tasks:
|
|||
# Task nicht ausführen, wenn weniger als zwei ZIP-Archive vorhanden
|
||||
- test -z $(ls -t *.zip | sed -n 2p)
|
||||
|
||||
# Task "linkcheck": checks the links contained in the split XML files of this data source.
# All commands below run in the directory given by `dir`.
linkcheck:
|
||||
dir: data/wuppertal
|
||||
desc: links überprüfen
|
||||
cmds:
|
||||
# Extract links: select every attribute whose local name is "href" from split/*.xml,
# keep the second field when splitting each output line at double quotes (the attribute value),
# and write the result to links.txt.
|
||||
- xmllint --xpath '//@*[local-name(.) = "href"]' split/*.xml | cut -d '"' -f2 > links.txt
|
||||
# Determine the HTTP status code of every link: the inner loop emits "-o /dev/null <link>"
# for each line of links.txt, so a single curl call issues a HEAD request per link and
# writes one "<status code> <effective URL>" line per request to linkcheck.log.
# NOTE(review): the command substitution is unquoted, so all links end up as words on one curl command line.
|
||||
- curl --silent --head --write-out "%{http_code} %{url_effective}\n" $(while read line; do echo "-o /dev/null $line"; done < links.txt) > linkcheck.log
|
||||
# Check the log for status codes other than 2xx: grep prints every line that does not start with "2"
# (this includes 3xx, 4xx and 5xx); if any such line exists, a message goes to stderr and the command exits with 1.
|
||||
- if grep '^[^2]' linkcheck.log; then echo 1>&2 "Logdatei $PWD/linkcheck.log enthält problematische status codes!" && exit 1; fi
|
||||
# Declared input and output files of this task — presumably used by Task for its up-to-date check; TODO confirm.
sources:
|
||||
- split/*.xml
|
||||
generates:
|
||||
- linkcheck.log
|
||||
|
||||
delete:
|
||||
desc: harvesting cache löschen
|
||||
cmds:
|
||||
|
|
Loading…
Reference in New Issue