From 4b8be6f1ad26e160cc8d97b9d9f44c23dc2c7b7c Mon Sep 17 00:00:00 2001
From: chie8842
Date: Fri, 21 Jun 2019 17:20:50 +0900
Subject: [PATCH 1/2] handle html file

---
 Dockerfile            |  2 +-
 bin/build-docker      |  2 +-
 bin/run-check         |  2 +-
 proofreading.sh       | 21 +++++++++++++++++++++
 src/html_converter.py | 31 +++++++++++++++++++++++++++++++
 5 files changed, 55 insertions(+), 3 deletions(-)
 create mode 100644 src/html_converter.py

diff --git a/Dockerfile b/Dockerfile
index a98136b..0641309 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,7 +5,7 @@ RUN apk update
 RUN apk --no-cache add tar wget openjdk8 gcc pkgconfig zeromq zeromq-dev musl-dev
 
 # install python package
-RUN pip install jupyter
+RUN pip install jupyter click html2text
 
 # download redpen
 # RUN wget https://github.com/redpen-cc/redpen/archive/redpen-1.10.2.tar.gz
diff --git a/bin/build-docker b/bin/build-docker
index 3ae2723..d8b1457 100755
--- a/bin/build-docker
+++ b/bin/build-docker
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-docker build --no-cache -t chie8842/proofreading .
+docker build --no-cache -t tfug/proofreading .
diff --git a/bin/run-check b/bin/run-check
index 502b9a1..fcc6270 100755
--- a/bin/run-check
+++ b/bin/run-check
@@ -3,5 +3,5 @@ docker run \
 -it \
 --rm \
 -v $(PWD)/..:/usr/local/documents \
-chie8842/tensorflow_docs_proofreading \
+tfug/proofreading \
 /bin/ash proofreading/proofreading.sh $@
diff --git a/proofreading.sh b/proofreading.sh
index 73d8669..aa9f3cb 100644
--- a/proofreading.sh
+++ b/proofreading.sh
@@ -17,6 +17,22 @@ function create_markdowns() {
   done
 }
 
+# convert one html to markdown and save to output directory
+function create_markdown_from_html() {
+  dir=`dirname ${file}`
+  output_dir=${dir//site/proofreading\/output}
+  echo $output_dir
+  mkdir -p ${output_dir}
+  python proofreading/src/html_converter.py --input_file ${file} --output_dir ${output_dir}
+}
+
+# find html files, convert to markdown and save to output directory
+function create_markdowns_from_html() {
+  files=`find site/ja -maxdepth 5 -type f |grep .html`
+  for file in ${files}; do
+    create_markdown_from_html
+  done
+}
 function copy_markdown() {
   files=`find site/${lang} -maxdepth 5 -type f |grep .md`
   for file in ${files}; do
@@ -44,6 +60,10 @@ if [ $# -eq 1 ]; then
     create_markdown
     base_filename=${file##*/}
     redpen --conf proofreading/redpen-conf.xml ${output_dir}/${base_filename%.*}.md
+  elif [ ${file##*.} = "html" ]; then
+    create_markdown_from_html
+    base_filename=${file##*/}
+    redpen --conf proofreading/redpen-conf.xml ${output_dir}/${base_filename%.*}.md
   elif [ ${file##*.} = "md" ]; then
     redpen --conf proofreading/redpen-conf.xml ${file}
   else
@@ -54,6 +74,7 @@ if [ $# -eq 1 ]; then
 else
   echo "check all files"
   create_markdowns
+  create_markdowns_from_html
   copy_markdown
   exec_redpen
 fi
diff --git a/src/html_converter.py b/src/html_converter.py
new file mode 100644
index 0000000..61f2515
--- /dev/null
+++ b/src/html_converter.py
@@ -0,0 +1,31 @@
+import os
+import sys
+import click
+import html2text
+
+@click.command()
+@click.option('--input_file', '-i', default=None)
+@click.option('--output_dir', '-o', default=None)
+def main(input_file, output_dir):
+    if input_file is None or output_dir is None:
+        print('invalid arguments')
+        sys.exit(1)
+    else:
+        try:
+            with open(input_file, 'r') as f:
+                html = f.read()
+        except:
+            print(f'{input_file} does not exist')
+            sys.exit(1)
+        text = html2text.html2text(html)
+        output = os.path.join(output_dir, os.path.basename(input_file))
+        try:
+            with open(output, 'w') as f:
+                f.write(text)
+        except:
+            print('output path does not exist')
+        print(f'converted {input_file} to {output_dir}')
+        sys.exit(0)
+
+if __name__ == '__main__':
+    main()
From 2399f08f874b3533fef265e47a3cb114bc1677e6 Mon Sep 17 00:00:00 2001
From: Chie Hayashida
Date: Mon, 21 Oct 2019 16:21:26 +0900
Subject: [PATCH 2/2] Update src/html_converter.py

Write the converted file as <name>.md instead of keeping the .html name.
Only the final extension is stripped (os.path.splitext), so a name such
as a.b.html becomes a.b.md, matching the ${base_filename%.*}.md path that
proofreading.sh passes to redpen.

Co-Authored-By: Shuhei Fujiwara
---
 src/html_converter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/html_converter.py b/src/html_converter.py
index 61f2515..46d4f83 100644
--- a/src/html_converter.py
+++ b/src/html_converter.py
@@ -18,7 +18,8 @@ def main(input_file, output_dir):
             print(f'{input_file} does not exist')
             sys.exit(1)
         text = html2text.html2text(html)
-        output = os.path.join(output_dir, os.path.basename(input_file))
+        output_file = '{}.md'.format(os.path.splitext(os.path.basename(input_file))[0])
+        output = os.path.join(output_dir, output_file)
         try:
             with open(output, 'w') as f:
                 f.write(text)