From 852dfbf75d645a995de703a03b696ac853775f29 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jul 2022 15:39:52 +0200 Subject: [PATCH 1/2] ocrd-import: disable everything basically --- ocrd-import | 186 ++++++++++++++++++++++++++-------------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd-import b/ocrd-import index c555cc3..9aaaf7d 100755 --- a/ocrd-import +++ b/ocrd-import @@ -1,10 +1,10 @@ #!/usr/bin/env bash -function critical { ocrd log -n ocrd-import critical "$1"; } -function error { ocrd log -n ocrd-import error "$1"; } -function warning { ocrd log -n ocrd-import warning "$1"; } -function info { ocrd log -n ocrd-import info "$1"; } -function debug { ocrd log -n ocrd-import debug "$1"; } +function critical { echo critical "$1"; } +function error { echo error "$1"; } +function warning { echo warning "$1"; } +function info { echo info "$1"; } +function debug { echo debug "$1"; } ((BASH_VERSINFO<4 || BASH_VERSINFO==4 && BASH_VERSINFO[1]<4)) && critical "bash $BASH_VERSION is too old. Please install 4.4 or newer" && exit 2 @@ -175,94 +175,94 @@ for file in $(find -L . -type f -not -name mets.xml -not -name "*.log" | sort); fi mimetype=${MIMETYPES[${suffix,,[A-Z]}]} #debug "found file '$file' (base=$base page=$page mimetype=$mimetype)" - case "$mimetype" in - ${MIMETYPE_PAGE}) - # FIXME should really validate this is PAGE-XML (cf. core#353) - if fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/ "$file" \ - && fgrep -qw 'PcGts' "$file"; then - group=OCR-D-SEG-PAGE - if ! ((numpageid)); then - base=${base/OCR-D-IMG/$group} - fi - elif fgrep -q http://www.loc.gov/standards/alto/ "$file" \ - && fgrep -qw alto "$file"; then - group=OCR-D-SEG-ALTO - if ! 
((numpageid)); then - base=${base/OCR-D-IMG/$group} - fi - elif (($ignore)); then - warning "unknown type of file '$file'" - exit #continue - else - critical "unknown type of file '$file'" - false - fi - ;; - application/pdf|application/postscript|application/oxps|image/x-*|"") - case "$suffix" in - .pdf|.PDF) - inopts=(-units PixelsPerInch -density $((2*$dpi))) - outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample $dpi -density $dpi) - ;; - *) - inopts=() - outopts=() - esac - if (($convert)) && \ - mkdir -p OCR-D-IMG && \ - warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" && \ - convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then - mimetype=image/tiff - IFS=$'\n' - files=($(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)) - IFS=$' \t\n' - info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)" - if ((${#files[*]}>1)); then - for file in "${files[@]}"; do - file="${file#./}" - base="${file%.tif}" - base="${base#OCR-D-IMG/}" - add_file $group $mimetype ${page}_${base:(-4)} "$base" "$file" - done - # there's no danger of clashes with other files here - exit # continue - else - file="${files[0]}" - file="${file#./}" - fi - elif (($ignore)); then - warning "unknown type of file '$file'" - exit # continue - else - critical "unknown type of file '$file'" - false - fi - ;; - esac - IFS=$'\n' - clashes=($(ocrd workspace find -i "$base" -k local_filename -k mimetype -k pageId)) - IFS=$' \t\n' - n=0 - for clash in "${clashes[@]}"; do - let n++ || true - IFS=$'\t' - fields=($clash) - IFS=$' \t\n' - # if image, allow PAGE with matching basename - # if PAGE, allow image with matching basename - if if test $group = OCR-D-IMG; then - test "x${fields[1]}" = x${MIMETYPE_PAGE} - else [[ "${fields[1]}" =~ image/ ]] - fi; then - # use existing pageId - page=${fields[2]} - # use new file ID - base="$(basename "$file")" # (including suffix) - base="${base// /_}" - 
else - warning "files '$file' ($mimetype) and '${fields[0]}' (${fields[1]}) have the same basename" - fi - done + #case "$mimetype" in + # ${MIMETYPE_PAGE}) + # # FIXME should really validate this is PAGE-XML (cf. core#353) + # if fgrep -q http://schema.primaresearch.org/PAGE/gts/pagecontent/ "$file" \ + # && fgrep -qw 'PcGts' "$file"; then + # group=OCR-D-SEG-PAGE + # if ! ((numpageid)); then + # base=${base/OCR-D-IMG/$group} + # fi + # elif fgrep -q http://www.loc.gov/standards/alto/ "$file" \ + # && fgrep -qw alto "$file"; then + # group=OCR-D-SEG-ALTO + # if ! ((numpageid)); then + # base=${base/OCR-D-IMG/$group} + # fi + # elif (($ignore)); then + # warning "unknown type of file '$file'" + # exit #continue + # else + # critical "unknown type of file '$file'" + # false + # fi + # ;; + # application/pdf|application/postscript|application/oxps|image/x-*|"") + # case "$suffix" in + # .pdf|.PDF) + # inopts=(-units PixelsPerInch -density $((2*$dpi))) + # outopts=(-background white -alpha remove -alpha off -colorspace Gray -units PixelsPerInch -resample $dpi -density $dpi) + # ;; + # *) + # inopts=() + # outopts=() + # esac + # if (($convert)) && \ + # mkdir -p OCR-D-IMG && \ + # warning "converting '$file' to 'OCR-D-IMG/${base}_*.tif' prior to import" && \ + # convert "${inopts[@]}" "$file" "${outopts[@]}" OCR-D-IMG/"${base}_%04d.tif"; then + # mimetype=image/tiff + # IFS=$'\n' + # files=($(find OCR-D-IMG -name "${base}_[0-9]*.tif" | sort)) + # IFS=$' \t\n' + # info "converted '$file' to 'OCR-D-IMG/${base}_*.tif' (${#files[*]} files)" + # if ((${#files[*]}>1)); then + # for file in "${files[@]}"; do + # file="${file#./}" + # base="${file%.tif}" + # base="${base#OCR-D-IMG/}" + # add_file $group $mimetype ${page}_${base:(-4)} "$base" "$file" + # done + # # there's no danger of clashes with other files here + # exit # continue + # else + # file="${files[0]}" + # file="${file#./}" + # fi + # elif (($ignore)); then + # warning "unknown type of file '$file'" + # exit # 
continue + # else + # critical "unknown type of file '$file'" + # false + # fi + # ;; + #esac + #IFS=$'\n' + #clashes=($(ocrd workspace find -i "$base" -k local_filename -k mimetype -k pageId)) + #IFS=$' \t\n' + #n=0 + #for clash in "${clashes[@]}"; do + # let n++ || true + # IFS=$'\t' + # fields=($clash) + # IFS=$' \t\n' + # # if image, allow PAGE with matching basename + # # if PAGE, allow image with matching basename + # if if test $group = OCR-D-IMG; then + # test "x${fields[1]}" = x${MIMETYPE_PAGE} + # else [[ "${fields[1]}" =~ image/ ]] + # fi; then + # # use existing pageId + # page=${fields[2]} + # # use new file ID + # base="$(basename "$file")" # (including suffix) + # base="${base// /_}" + # else + # warning "files '$file' ($mimetype) and '${fields[0]}' (${fields[1]}) have the same basename" + # fi + #done # finally, add the file to the METS add_file $group $mimetype $page "$base" "$file" )& From 9bbbb873b567cbe2b768dabe6bbfe832e7739de6 Mon Sep 17 00:00:00 2001 From: Stefan von der Heide Date: Fri, 5 Aug 2022 17:14:32 +0200 Subject: [PATCH 2/2] adapt to ensure `make_file_id` compatible pageID --- ocrd-import | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/ocrd-import b/ocrd-import index 9aaaf7d..7db1fa0 100755 --- a/ocrd-import +++ b/ocrd-import @@ -144,6 +144,7 @@ for file in $(find -L . -type f -not -name mets.xml -not -name "*.log" | sort); set -e trap rollback ERR page=p${zeros:0:$((4-${#num}))}$num + echo "PAGE=$page" group=OCR-D-IMG file="${file#./}" for suffix in "${skip[@]}"; do @@ -165,15 +166,18 @@ for file in $(find -L . -type f -not -name mets.xml -not -name "*.log" | sort); # also, avoid . in IDs, because downstream it will confuse filename suffix detection base="${base//[ :.]/_}" if ! [[ ${base:0:1} =~ [a-zA-Z] ]]; then - base=f${base} + #base=P_${base} + a=0 # just do something to have a correct syntax for this 'if' fi if !
((numpageid)); then - page=$base + page=P_$base + #echo "PAGE=$page" # file IDs must contain group and page ID, or processors will have to # prevent ID clashes by using numeric IDs base=${group}_"$base" fi mimetype=${MIMETYPES[${suffix,,[A-Z]}]} + #echo "BASE=$base" #debug "found file '$file' (base=$base page=$page mimetype=$mimetype)" #case "$mimetype" in # ${MIMETYPE_PAGE})