-
Notifications
You must be signed in to change notification settings - Fork 3
Bare bones add #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Bare bones add #23
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,10 +1,10 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| function critical { ocrd log -n ocrd-import critical "$1"; } | ||
| function error { ocrd log -n ocrd-import error "$1"; } | ||
| function warning { ocrd log -n ocrd-import warning "$1"; } | ||
| function info { ocrd log -n ocrd-import info "$1"; } | ||
| function debug { ocrd log -n ocrd-import debug "$1"; } | ||
| function critical { echo critical "$1"; } | ||
| function error { echo error "$1"; } | ||
| function warning { echo warning "$1"; } | ||
| function info { echo info "$1"; } | ||
| function debug { echo debug "$1"; } | ||
|
|
||
| ((BASH_VERSINFO<4 || BASH_VERSINFO==4 && BASH_VERSINFO[1]<4)) && critical "bash $BASH_VERSION is too old. Please install 4.4 or newer" && exit 2 | ||
|
|
||
|
|
@@ -144,6 +144,7 @@ for file in $(find -L . -type f -not -name mets.xml -not -name "*.log" | sort); | |
| set -e | ||
| trap rollback ERR | ||
| page=p${zeros:0:$((4-${#num}))}$num | ||
| echo "PAGE=$page" | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Better to use the log function here.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this was just debugging output by @stefanCCS; it can be removed without replacement. |
||
| group=OCR-D-IMG | ||
| file="${file#./}" | ||
| for suffix in "${skip[@]}"; do | ||
|
|
@@ -165,104 +166,107 @@ for file in $(find -L . -type f -not -name mets.xml -not -name "*.log" | sort); | |
| # also, avoid . in IDs, because downstream it will confuse filename suffix detection | ||
| base="${base//[ :.]/_}" | ||
| if ! [[ ${base:0:1} =~ [a-zA-Z] ]]; then | ||
| base=f${base} | ||
| #base=P_${base} | ||
| a=0 # just do something to have a correct syntax for this 'if' | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Non-XS-compliant page names are a problem regardless of
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure what the intention was here, @stefanCCS? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| fi | ||
| if ! ((numpageid)); then | ||
| page=$base | ||
| page=P_$base | ||
| #echo "PAGE=$page" | ||
| # file IDs must contain group and page ID, or processors will have to | ||
| # prevent ID clashes by using numeric IDs | ||
| base=${group}_"$base" | ||
| fi | ||
| mimetype=${MIMETYPES[${suffix,,[A-Z]}]} | ||
| #echo "BASE=$base" | ||
| #debug "found file '$file' (base=$base page=$page mimetype=$mimetype)" | ||
| case "$mimetype" in | ||
| ${MIMETYPE_PAGE}) | ||
| # FIXME should really validate this is PAGE-XML (cf. core#353) | ||
| if fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/ "$file" \ | ||
| && fgrep -qw 'PcGts' "$file"; then | ||
| group=OCR-D-SEG-PAGE | ||
| if ! ((numpageid)); then | ||
| base=${base/OCR-D-IMG/$group} | ||
| fi | ||
| elif fgrep -q http://www.loc.gov/standards/alto/ "$file" \ | ||
| && fgrep -qw alto "$file"; then | ||
| group=OCR-D-SEG-ALTO | ||
| if ! ((numpageid)); then | ||
| base=${base/OCR-D-IMG/$group} | ||
| fi | ||
| elif (($ignore)); then | ||
| warning "unknown type of file '$file'" | ||
| exit #continue | ||
| else | ||
| critical "unknown type of file '$file'" | ||
| false | ||
| fi | ||
| ;; | ||
| application/pdf|application/postscript|application/oxps|image/x-*|"") | ||
| case "$suffix" in | ||
| .pdf|.PDF) | ||
| inopts=(-units PixelsPerInch -density $((2*$dpi))) | ||
| outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample $dpi -density $dpi) | ||
| ;; | ||
| *) | ||
| inopts=() | ||
| outopts=() | ||
| esac | ||
| if (($convert)) && \ | ||
| mkdir -p OCR-D-IMG && \ | ||
| warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" && \ | ||
| convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then | ||
| mimetype=image/tiff | ||
| IFS=$'\n' | ||
| files=($(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)) | ||
| IFS=$' \t\n' | ||
| info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)" | ||
| if ((${#files[*]}>1)); then | ||
| for file in "${files[@]}"; do | ||
| file="${file#./}" | ||
| base="${file%.tif}" | ||
| base="${base#OCR-D-IMG/}" | ||
| add_file $group $mimetype ${page}_${base:(-4)} "$base" "$file" | ||
| done | ||
| # there's no danger of clashes with other files here | ||
| exit # continue | ||
| else | ||
| file="${files[0]}" | ||
| file="${file#./}" | ||
| fi | ||
| elif (($ignore)); then | ||
| warning "unknown type of file '$file'" | ||
| exit # continue | ||
| else | ||
| critical "unknown type of file '$file'" | ||
| false | ||
| fi | ||
| ;; | ||
| esac | ||
| IFS=$'\n' | ||
| clashes=($(ocrd workspace find -i "$base" -k local_filename -k mimetype -k pageId)) | ||
| IFS=$' \t\n' | ||
| n=0 | ||
| for clash in "${clashes[@]}"; do | ||
| let n++ || true | ||
| IFS=$'\t' | ||
| fields=($clash) | ||
| IFS=$' \t\n' | ||
| # if image, allow PAGE with matching basename | ||
| # if PAGE, allow image with matching basename | ||
| if if test $group = OCR-D-IMG; then | ||
| test "x${fields[1]}" = x${MIMETYPE_PAGE} | ||
| else [[ "${fields[1]}" =~ image/ ]] | ||
| fi; then | ||
| # use existing pageId | ||
| page=${fields[2]} | ||
| # use new file ID | ||
| base="$(basename "$file")" # (including suffix) | ||
| base="${base// /_}" | ||
| else | ||
| warning "files '$file' ($mimetype) and '${fields[0]}' (${fields[1]}) have the same basename" | ||
| fi | ||
| done | ||
| #case "$mimetype" in | ||
| # ${MIMETYPE_PAGE}) | ||
|
Comment on lines
-265
to
+183
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you want to have a version with next to no checks, the right approach would be to just add a switch to circumvent the no-clash check for resulting pages/files. Everything else is already fast (like the PAGE vs ALTO check for .xml) or can be deactivated (like the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, the commenting-out was far too broad. The "offending" call that slows down the import is `clashes=($(ocrd workspace find -i "$base" -k local_filename -k mimetype -k pageId))`. |
||
| # # FIXME should really validate this is PAGE-XML (cf. core#353) | ||
| # if fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/ "$file" \ | ||
| # && fgrep -qw 'PcGts' "$file"; then | ||
| # group=OCR-D-SEG-PAGE | ||
| # if ! ((numpageid)); then | ||
| # base=${base/OCR-D-IMG/$group} | ||
| # fi | ||
| # elif fgrep -q http://www.loc.gov/standards/alto/ "$file" \ | ||
| # && fgrep -qw alto "$file"; then | ||
| # group=OCR-D-SEG-ALTO | ||
| # if ! ((numpageid)); then | ||
| # base=${base/OCR-D-IMG/$group} | ||
| # fi | ||
| # elif (($ignore)); then | ||
| # warning "unknown type of file '$file'" | ||
| # exit #continue | ||
| # else | ||
| # critical "unknown type of file '$file'" | ||
| # false | ||
| # fi | ||
| # ;; | ||
| # application/pdf|application/postscript|application/oxps|image/x-*|"") | ||
| # case "$suffix" in | ||
| # .pdf|.PDF) | ||
| # inopts=(-units PixelsPerInch -density $((2*$dpi))) | ||
| # outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample $dpi -density $dpi) | ||
| # ;; | ||
| # *) | ||
| # inopts=() | ||
| # outopts=() | ||
| # esac | ||
| # if (($convert)) && \ | ||
| # mkdir -p OCR-D-IMG && \ | ||
| # warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" && \ | ||
| # convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then | ||
| # mimetype=image/tiff | ||
| # IFS=$'\n' | ||
| # files=($(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)) | ||
| # IFS=$' \t\n' | ||
| # info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)" | ||
| # if ((${#files[*]}>1)); then | ||
| # for file in "${files[@]}"; do | ||
| # file="${file#./}" | ||
| # base="${file%.tif}" | ||
| # base="${base#OCR-D-IMG/}" | ||
| # add_file $group $mimetype ${page}_${base:(-4)} "$base" "$file" | ||
| # done | ||
| # # there's no danger of clashes with other files here | ||
| # exit # continue | ||
| # else | ||
| # file="${files[0]}" | ||
| # file="${file#./}" | ||
| # fi | ||
| # elif (($ignore)); then | ||
| # warning "unknown type of file '$file'" | ||
| # exit # continue | ||
| # else | ||
| # critical "unknown type of file '$file'" | ||
| # false | ||
| # fi | ||
| # ;; | ||
| #esac | ||
| #IFS=$'\n' | ||
| #clashes=($(ocrd workspace find -i "$base" -k local_filename -k mimetype -k pageId)) | ||
| #IFS=$' \t\n' | ||
| #n=0 | ||
| #for clash in "${clashes[@]}"; do | ||
| # let n++ || true | ||
| # IFS=$'\t' | ||
| # fields=($clash) | ||
| # IFS=$' \t\n' | ||
| # # if image, allow PAGE with matching basename | ||
| # # if PAGE, allow image with matching basename | ||
| # if if test $group = OCR-D-IMG; then | ||
| # test "x${fields[1]}" = x${MIMETYPE_PAGE} | ||
| # else [[ "${fields[1]}" =~ image/ ]] | ||
| # fi; then | ||
| # # use existing pageId | ||
| # page=${fields[2]} | ||
| # # use new file ID | ||
| # base="$(basename "$file")" # (including suffix) | ||
| # base="${base// /_}" | ||
| # else | ||
| # warning "files '$file' ($mimetype) and '${fields[0]}' (${fields[1]}) have the same basename" | ||
| # fi | ||
| #done | ||
| # finally, add the file to the METS | ||
| add_file $group $mimetype $page "$base" "$file" | ||
| )& | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If the OCR-D logging facility is too inefficient, I suggest using a proper mechanism to override it with an additional switch. For example, by aliasing
`ocrd log -n ocrd-import` or `echo` to a common `log` backend.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this was just a quick hack to reduce the overhead of calling
`ocrd log`. What do you mean by aliasing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🎉 that is a nice solution and configurable too.