diff --git a/example/rcnn/.gitignore b/example/rcnn/.gitignore deleted file mode 100644 index 0c2dc3015e8e..000000000000 --- a/example/rcnn/.gitignore +++ /dev/null @@ -1,82 +0,0 @@ -# IntelliJ project files -.idea -*.iml -out -gen - -### Vim template -[._]*.s[a-w][a-z] -[._]s[a-w][a-z] -*.un~ -Session.vim -.netrwhist -*~ - -### IPythonNotebook template -# Temporary data -.ipynb_checkpoints/ - -### Python template -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -env/ -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -*.egg-info/ -.installed.cfg -*.egg - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*,cover - -# Translations -*.mo -*.pot - -# Django stuff: -*.log - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -*.ipynb -*.params -*.json -.vscode/ \ No newline at end of file diff --git a/example/rcnn/Makefile b/example/rcnn/Makefile deleted file mode 100644 index 66a3ed047a49..000000000000 --- a/example/rcnn/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -all: - cd rcnn/cython/; python setup.py build_ext --inplace; rm -rf build; cd ../../ - cd rcnn/pycocotools/; python setup.py build_ext --inplace; rm -rf build; cd ../../ -clean: - cd rcnn/cython/; rm *.so *.c *.cpp; cd ../../ - cd rcnn/pycocotools/; rm *.so; cd ../../ diff --git a/example/rcnn/README.md b/example/rcnn/README.md index dbf2a423c36d..ab3c8fb88c39 100644 --- a/example/rcnn/README.md +++ b/example/rcnn/README.md @@ -1,185 +1,59 @@ -# Faster R-CNN in MXNet with distributed implementation and data parallelization - -![example detections](https://cloud.githubusercontent.com/assets/13162287/22101032/92085dc0-de6c-11e6-9228-67e72606ddbc.png) - -## Why? -There exist good implementations of Faster R-CNN yet they lack support for recent -ConvNet architectures. The aim of reproducing it from scratch is to fully utilize -MXNet engines and parallelization for object detection. - -| Indicator | py-faster-rcnn (caffe resp.) | mx-rcnn (this reproduction) | -| :-------- | :--------------------------- | :-------------------------- | -| Speed [1] | 2.5 img/s training, 5 img/s testing | 3.8 img/s in training, 12.5 img/s testing | -| Performance [2] | mAP 73.2 | mAP 75.97 | -| Efficiency [3] | 11G for Fast R-CNN | 4.6G for Fast R-CNN | -| Parallelization [4] | None | 3.8 img/s to 6 img/s for 2 GPUs | -| Extensibility [5] | Old framework and base networks | ResNet | - -[1] On Ubuntu 14.04.5 with device Titan X, cuDNN enabled. - The experiment is VGG-16 end-to-end training. -[2] VGG network. Trained end-to-end on VOC07trainval+12trainval, tested on VOC07 test. -[3] VGG network. Fast R-CNN is the most memory expensive process. -[4] VGG network (parallelization limited by bandwidth). - ResNet-101 speeds up from 2 img/s to 3.5 img/s. -[5] py-faster-rcnn does not support ResNet or recent caffe version. - -## Why Not? -* If you value stability and reproducibility over performance and efficiency, please refer to official implementations. - There is no promise in all cases nor all experiments. -* If you value simplicity. Technical details are *very complicated* in MXNet. 
- This is by design to attain maximum possible performance instead of patching fixes after fixes. - Performance and parallelization are more than a change of parameter. -* If you want to do CPU training, be advised that it has not been verified properly yet. You can change the `ctx` variable in `train_end2end.py` or `train_alternate.py` scripts to `mx.cpu` and run these scripts directly to test it. -* If you are on Windows some people reported it was possible with some modifications. But they have disappeared. - -## Experiments -| Method | Network | Training Data | Testing Data | Reference | Result | -| :----- | :------ | :------------ | :----------- | :-------: | :----: | -| Fast R-CNN | VGG16 | VOC07 | VOC07test | 66.9 | 66.50 | -| Faster R-CNN alternate | VGG16 | VOC07 | VOC07test | 69.9 | 69.62 | -| Faster R-CNN end-to-end | VGG16 | VOC07 | VOC07test | 69.9 | 70.23 | -| Faster R-CNN end-to-end | VGG16 | VOC07+12 | VOC07test | 73.2 | 75.97 | -| Faster R-CNN end-to-end | ResNet-101 | VOC07+12 | VOC07test | 76.4 | 79.35 | -| Faster R-CNN end-to-end | VGG16 | COCO train | COCO val | 21.2 | 22.8 | -| Faster R-CNN end-to-end | ResNet-101 | COCO train | COCO val | 27.2 | 26.1 | - -The above experiments were conducted at [mx-rcnn](https://github.com/precedenceguo/mx-rcnn/tree/6a1ab0eec5035a10a1efb5fc8c9d6c54e101b4d0) -using [a MXNet fork, based on MXNet 0.9.1 nnvm pre-release](https://github.com/precedenceguo/mxnet/tree/simple). - -## Quickstart -* Prepare: `bash script/additional_deps.sh` -* Download training data: `bash script/get_voc.sh` -* Download pretrained model: `bash script/get_pretrained_model.sh` -* Training and testing: `bash script/vgg_voc07.sh 0,1` (this means to use gpu 0 and 1) - -## Prerequisites -* Pip, Python-dev, Unzip -* Some python packages are required: Cython, Scikit-image, Easydict, Matplot, OpenCV, Future -* On debian, you can usually run `sudo apt install python-pip python-dev unzip` -* And the python packages can be installed by running `sudo pip install cython scikit-image easydict matplotlib opencv-python future`. Note that you may have to remove sudo depending on how your mxnet package is installed. -* MXNet version v0.9.5 or higher with Python interface installed. Open `python` type `import mxnet` to confirm. - -## Getting started -* Suppose `HOME` represents where this file is located. All commands, unless stated otherwise, should be started from `HOME`. -* Ensure that `bash script/additional_deps.sh` installs all prerequisites listed above. If you're not using this script, ensure above prerequisities are present on your system and then run `make` from `HOME`. This builds the cython extensions and installs python bindings for them. - -Command line arguments have the same meaning as in mxnet/example/image-classification. -* `prefix` refers to the first part of a saved model file name and `epoch` refers to a number in this file name. - In `model/vgg-0000.params`, `prefix` is `"model/vgg"` and `epoch` is `0`. -* `begin_epoch` means the start of your training process, which will apply to all saved checkpoints. -* Remember to turn off cudnn auto tune. `export MXNET_CUDNN_AUTOTUNE_DEFAULT=0`. - -## Demo (Pascal VOC) -* An example of trained model (trained on VOC07 trainval) can be accessed from - [Baidu Yun](http://pan.baidu.com/s/1boRhGvH) (ixiw) or - [Dropbox](https://www.dropbox.com/s/jrr83q0ai2ckltq/final-0000.params.tar.gz?dl=0). - If you put the extracted model `final-0000.params` in `HOME` then use `--prefix final --epoch 0` to access it. 
-* Try out detection result by running `python demo.py --prefix final --epoch 0 --image myimage.jpg --gpu 0 --vis`. - Drop the `--vis` if you do not have a display or want to save as a new file. - -## Training Faster R-CNN -The following tutorial is based on VOC data, VGG network. Supply `--network resnet` and -`--dataset coco` to use other networks and datasets. -Refer to `script/vgg_voc07.sh` and other experiments for examples. - -### Prepare Training Data -See `bash script/get_voc.sh` and `bash script/get_coco.sh` will do the following for you. -* Make a folder `data` in `HOME`. `data` folder will be used to place the training data folder `VOCdevkit` and `coco`. -* Download and extract [Pascal VOC data](http://host.robots.ox.ac.uk/pascal/VOC/), place the `VOCdevkit` folder in `HOME/data`. -* Download and extract [coco dataset](http://mscoco.org/dataset/), place all images to `coco/images` and annotation jsons to `data/annotations`. - -(Skip this if not interested) All dataset have three attributes, `image_set`, `root_path` and `dataset_path`. -* `image_set` could be `2007_trainval` or something like `2007trainval+2012trainval`. -* `root_path` is usually `data`, where `cache`, `selective_search_data`, `rpn_data` will be stored. -* `dataset_path` could be something like `data/VOCdevkit`, where images, annotations and results can be put so that many copies of datasets can be linked to the same actual place. - -### Prepare Pretrained Models -See if `bash script/get_pretrained_model.sh` will do this for you. If not, -* Make a folder `model` in `HOME`. `model` folder will be used to place model checkpoints along the training process. - It is recommended to set `model` as a symbolic link to somewhere else in hard disk. -* Download VGG16 pretrained model `vgg16-0000.params` from [MXNet model gallery](https://github.com/dmlc/mxnet-model-gallery/blob/master/imagenet-1k-vgg.md) to `model` folder. -* Download ResNet pretrained model `resnet-101-0000.params` from [ResNet](https://github.com/tornadomeet/ResNet) to `model` folder. - -### Alternate Training -See if `bash script/vgg_alter_voc07.sh 0` (use gpu 0) will do the following for you. -* Start training by running `python train_alternate.py`. This will train the VGG network on the VOC07 trainval. - More control of training process can be found in the argparse help. -* Start testing by running `python test.py --prefix model/final --epoch 0` after completing the training process. - This will test the VGG network on the VOC07 test with the model in `HOME/model/final-0000.params`. - Adding a `--vis` will turn on visualization and `-h` will show help as in the training process. - -### End-to-end Training (approximate process) -See if `bash script/vgg_voc07.sh 0` (use gpu 0) will do the following for you. -* Start training by running `python train_end2end.py`. This will train the VGG network on VOC07 trainval. -* Start testing by running `python test.py`. This will test the VGG network on the VOC07 test. - -## Training Fast R-CNN (legacy from the initial version) -See if `bash script/get_selective.sh` and `bash script/vgg_fast_rcnn.sh 0` (use gpu 0) will do the following for you. -* To reproduce Fast R-CNN, `scipy` is used to load selective search proposals. -* Download [precomputed selective search data](https://github.com/rbgirshick/fast-rcnn/tree/master/data) and place them to `data` folder. - `script/get_selective_search.sh` will do this. 
-* Start training by running `python -m rcnn.tools.train_rcnn --proposal selective_search` to use the selective search proposal.
-* Start testing by running `python -m rcnn.tools.test_rcnn --proposal selective_search`.
-* `script/vgg_fast_rcnn.sh` will train Fast R-CNN on VOC07 and test on VOC07test.
-
-## What is Faster R-CNN, anyway?
-Region Proposal Network solves object detection as a regression problem
-from the objectness perspective. Bounding boxes are predicted by applying
-learned bounding box deltas to base boxes, namely anchor boxes across
-different positions in feature maps. Training process directly learns a
-mapping from raw image intensities to bounding box transformation targets.
-
-Fast R-CNN treats general object detection as a classification problem and
-bounding box prediction as a regression problem. Classifying cropped region
-feature maps and predicting bounding box displacements together yields
-detection results. Cropping feature maps instead of image input accelerates
-computation utilizing shared convolution maps. Bounding box displacements
-are simultaneously learned in the training process.
-
-Faster R-CNN utilize an alternate optimization training process between RPN
-and Fast R-CNN. Fast R-CNN weights are used to initiate RPN for training.
-The approximate joint training scheme does not backpropagate rcnn training
-error to rpn training.
-
-## Structure
-This repository provides Faster R-CNN as a package named `rcnn`.
- * `rcnn.core`: core routines in Faster R-CNN training and testing.
- * `rcnn.cython`: cython speedup from py-faster-rcnn.
- * `rcnn.dataset`: dataset library. Base class is `rcnn.dataset.imdb.IMDB`.
- * `rcnn.io`: prepare training data.
- * `rcnn.processing`: data and label processing library.
- * `rcnn.pycocotools`: python api from coco dataset.
- * `rcnn.symbol`: symbol and operator.
- * `rcnn.tools`: training and testing wrapper.
- * `rcnn.utils`: utilities in training and testing, usually overloads mxnet functions.
-
-## Disclaimer
+# Faster R-CNN in MXNet
+
+Please redirect any issues or questions about this symbolic example of Faster R-CNN to https://github.com/ijkguo/mx-rcnn.
+For a Gluon imperative version, check out https://github.com/dmlc/gluon-cv.
+
+### Set up environment
+* Requires the latest MXNet. Disable cuDNN autotune with `export MXNET_CUDNN_AUTOTUNE_DEFAULT=0`.
+* Install the Python package `mxnet` (CPU inference only) or `mxnet-cu90` (GPU training), then `cython` and `opencv-python matplotlib pycocotools tqdm`.
+
+### Out-of-box inference models
+Download any of the following models to the current directory and run `python3 demo.py --dataset $Dataset$ --network $Network$ --params $MODEL_FILE$ --image $YOUR_IMAGE$` for single-image inference.
+For example, `python3 demo.py --dataset voc --network vgg16 --params vgg16_voc0712.params --image myimage.jpg`; optionally add `--gpu 0` to run on a GPU.
+Each network has its own configuration and each dataset has its own object class names, so you must pass both explicitly as command-line arguments.
+
+| Network | Dataset | Imageset | Reference | Result | Link |
+| :------ | :------------ | :----------- | :-------: | :----: | :---: |
+| vgg16 | voc | 07/07 | 69.9 | 70.23 | [Dropbox](https://www.dropbox.com/s/gfxnf1qzzc0lzw2/vgg_voc07-0010.params?dl=0) |
+| vgg16 | voc | 07++12/07 | 73.2 | 75.97 | [Dropbox](https://www.dropbox.com/s/rvktx65s48cuyb9/vgg_voc0712-0010.params?dl=0) |
+| resnet101 | voc | 07++12/07 | 76.4 | 79.35 | [Dropbox](https://www.dropbox.com/s/ge2wl0tn47xezdf/resnet_voc0712-0010.params?dl=0) |
+| vgg16 | coco | train2017/val2017 | 21.2 | 22.8 | [Dropbox](https://www.dropbox.com/s/e0ivvrc4pku3vj7/vgg_coco-0010.params?dl=0) |
+| resnet101 | coco | train2017/val2017 | 27.2 | 26.1 | [Dropbox](https://www.dropbox.com/s/bfuy2uo1q1nwqjr/resnet_coco-0010.params?dl=0) |
+
+### Download data and labels
+Make a directory `data` and follow `py-faster-rcnn` for data preparation instructions.
+* [Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/) should be in `data/VOCdevkit` containing `VOC2007`, `VOC2012` and `annotations`.
+* [MSCOCO](http://mscoco.org/dataset/) should be in `data/coco` containing `train2017`, `val2017` and `annotations/instances_train2017.json`, `annotations/instances_val2017.json`.
+
+### Download pretrained ImageNet models
+* [VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/) should be at `model/vgg16-0000.params` from [MXNet model zoo](http://data.dmlc.ml/models/imagenet/vgg/).
+* [ResNet](https://github.com/tornadomeet/ResNet) should be at `model/resnet-101-0000.params` from [MXNet model zoo](http://data.dmlc.ml/models/imagenet/resnet/).
+
+### Training and evaluation
+Use `python3 train.py --dataset $Dataset$ --network $Network$ --pretrained $IMAGENET_MODEL_FILE$ --gpus $GPUS$` to train,
+for example, `python3 train.py --dataset voc --network vgg16 --pretrained model/vgg16-0000.params --gpus 0,1`.
+Use `python3 test.py --dataset $Dataset$ --network $Network$ --params $MODEL_FILE$ --gpu $GPU$` to evaluate,
+for example, `python3 test.py --dataset voc --network vgg16 --params model/vgg16-0010.params --gpu 0`.
+
+### History
+* May 25, 2016: We released the Fast R-CNN implementation.
+* July 6, 2016: We released the Faster R-CNN implementation.
+* July 23, 2016: We updated to the MXNet module solver.
+* Oct 10, 2016: tornadomeet released approximate end-to-end training.
+* Oct 30, 2016: We updated to MXNet module inference.
+* Jan 19, 2017: We accelerated our pipeline and supported ResNet training.
+* Jun 22, 2018: We simplified the code.
+
+### Disclaimer
 This repository used code from [MXNet](https://github.com/dmlc/mxnet),
 [Fast R-CNN](https://github.com/rbgirshick/fast-rcnn),
 [Faster R-CNN](https://github.com/rbgirshick/py-faster-rcnn),
 [caffe](https://github.com/BVLC/caffe),
 [tornadomeet/mx-rcnn](https://github.com/tornadomeet/mx-rcnn),
 [MS COCO API](https://github.com/pdollar/coco).
-Training data are from
-[Pascal VOC](http://host.robots.ox.ac.uk/pascal/VOC/),
-[ImageNet](http://image-net.org/),
-[COCO](http://mscoco.org/).
-Model comes from
-[VGG16](http://www.robots.ox.ac.uk/~vgg/research/very_deep/),
-[ResNet](https://github.com/tornadomeet/ResNet).
 Thanks to tornadomeet for end-to-end experiments and MXNet contributors for helpful discussions.
-History of this implementation is: -* Fast R-CNN (v1) -* Faster R-CNN (v2) -* Faster R-CNN with module training (v3) -* Faster R-CNN with end-to-end training (v3.5, tornadomeet/mx-rcnn) -* Faster R-CNN with end-to-end training and module testing (v4) -* Faster R-CNN with accelerated training and resnet (v5) - -mxnet/example/rcnn was v1, v2, v3.5 and now v5. - -## References +### References 1. Tianqi Chen, Mu Li, Yutian Li, Min Lin, Naiyan Wang, Minjie Wang, Tianjun Xiao, Bing Xu, Chiyuan Zhang, and Zheng Zhang. MXNet: A Flexible and Efficient Machine Learning Library for Heterogeneous Distributed Systems. In Neural Information Processing Systems, Workshop on Machine Learning Systems, 2015 2. Ross Girshick. "Fast R-CNN." In Proceedings of the IEEE International Conference on Computer Vision, 2015. 3. Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. "Faster R-CNN: Towards real-time object detection with region proposal networks." In IEEE Transactions on Pattern Analysis and Machine Intelligence, 2016. diff --git a/example/rcnn/demo.py b/example/rcnn/demo.py index b59403379ddd..2315bb8af366 100644 --- a/example/rcnn/demo.py +++ b/example/rcnn/demo.py @@ -16,144 +16,205 @@ # under the License. import argparse -import os -import cv2 +import ast +import pprint + import mxnet as mx -import numpy as np -from rcnn.logger import logger -from rcnn.config import config -from rcnn.symbol import get_vgg_test, get_vgg_rpn_test -from rcnn.io.image import resize, transform -from rcnn.core.tester import Predictor, im_detect, im_proposal, vis_all_detection, draw_all_detection -from rcnn.utils.load_model import load_param -from rcnn.processing.nms import py_nms_wrapper, cpu_nms_wrapper, gpu_nms_wrapper - - -CLASSES = ('__background__', - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor') -config.TEST.HAS_RPN = True -SHORT_SIDE = config.SCALES[0][0] -LONG_SIDE = config.SCALES[0][1] -PIXEL_MEANS = config.PIXEL_MEANS -DATA_NAMES = ['data', 'im_info'] -LABEL_NAMES = None -DATA_SHAPES = [('data', (1, 3, LONG_SIDE, SHORT_SIDE)), ('im_info', (1, 3))] -LABEL_SHAPES = None -# visualization -CONF_THRESH = 0.7 -NMS_THRESH = 0.3 -nms = py_nms_wrapper(NMS_THRESH) - - -def get_net(symbol, prefix, epoch, ctx): - arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx, process=True) - - # infer shape - data_shape_dict = dict(DATA_SHAPES) - arg_names, aux_names = symbol.list_arguments(), symbol.list_auxiliary_states() - arg_shape, _, aux_shape = symbol.infer_shape(**data_shape_dict) - arg_shape_dict = dict(zip(arg_names, arg_shape)) - aux_shape_dict = dict(zip(aux_names, aux_shape)) +from mxnet.module import Module - # check shapes - for k in symbol.list_arguments(): - if k in data_shape_dict or 'label' in k: - continue - assert k in arg_params, k + ' not initialized' - assert arg_params[k].shape == arg_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) - for k in symbol.list_auxiliary_states(): - assert k in aux_params, k + ' not initialized' - assert aux_params[k].shape == aux_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) - - predictor = Predictor(symbol, DATA_NAMES, LABEL_NAMES, context=ctx, - provide_data=DATA_SHAPES, provide_label=LABEL_SHAPES, - arg_params=arg_params, 
aux_params=aux_params) - return predictor - - -def generate_batch(im): - """ - preprocess image, return batch - :param im: cv2.imread returns [height, width, channel] in BGR - :return: - data_batch: MXNet input batch - data_names: names in data_batch - im_scale: float number - """ - im_array, im_scale = resize(im, SHORT_SIDE, LONG_SIDE) - im_array = transform(im_array, PIXEL_MEANS) - im_info = np.array([[im_array.shape[2], im_array.shape[3], im_scale]], dtype=np.float32) - data = [mx.nd.array(im_array), mx.nd.array(im_info)] - data_shapes = [('data', im_array.shape), ('im_info', im_info.shape)] - data_batch = mx.io.DataBatch(data=data, label=None, provide_data=data_shapes, provide_label=None) - return data_batch, DATA_NAMES, im_scale - - -def demo_net(predictor, image_name, vis=False): - """ - generate data_batch -> im_detect -> post process - :param predictor: Predictor - :param image_name: image name - :param vis: will save as a new image if not visualized - :return: None - """ - assert os.path.exists(image_name), image_name + ' not found' - im = cv2.imread(image_name) - data_batch, data_names, im_scale = generate_batch(im) - scores, boxes, data_dict = im_detect(predictor, data_batch, data_names, im_scale) - - all_boxes = [[] for _ in CLASSES] - for cls in CLASSES: - cls_ind = CLASSES.index(cls) - cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)] - cls_scores = scores[:, cls_ind, np.newaxis] - keep = np.where(cls_scores >= CONF_THRESH)[0] - dets = np.hstack((cls_boxes, cls_scores)).astype(np.float32)[keep, :] - keep = nms(dets) - all_boxes[cls_ind] = dets[keep, :] - - boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] - - # print results - logger.info('---class---') - logger.info('[[x1, x2, y1, y2, confidence]]') - for ind, boxes in enumerate(boxes_this_image): - if len(boxes) > 0: - logger.info('---%s---' % CLASSES[ind]) - logger.info('%s' % boxes) - - if vis: - vis_all_detection(data_dict['data'].asnumpy(), boxes_this_image, CLASSES, im_scale) +from symdata.bbox import im_detect +from symdata.loader import load_test, generate_batch +from symdata.vis import vis_detection +from symnet.model import load_param, check_shape + + +def demo_net(sym, class_names, args): + # print config + print('called with args\n{}'.format(pprint.pformat(vars(args)))) + + # setup context + if args.gpu: + ctx = mx.gpu(int(args.gpu)) else: - result_file = image_name.replace('.', '_result.') - logger.info('results saved to %s' % result_file) - im = draw_all_detection(data_dict['data'].asnumpy(), boxes_this_image, CLASSES, im_scale) - cv2.imwrite(result_file, im) + ctx = mx.cpu(0) + + # load single test + im_tensor, im_info, im_orig = load_test(args.image, short=args.img_short_side, max_size=args.img_long_side, + mean=args.img_pixel_means, std=args.img_pixel_stds) + + # generate data batch + data_batch = generate_batch(im_tensor, im_info) + + # load params + arg_params, aux_params = load_param(args.params, ctx=ctx) + + # produce shape max possible + data_names = ['data', 'im_info'] + label_names = None + data_shapes = [('data', (1, 3, args.img_long_side, args.img_long_side)), ('im_info', (1, 3))] + label_shapes = None + + # check shapes + check_shape(sym, data_shapes, arg_params, aux_params) + + # create and bind module + mod = Module(sym, data_names, label_names, context=ctx) + mod.bind(data_shapes, label_shapes, for_training=False) + mod.init_params(arg_params=arg_params, aux_params=aux_params) + + # forward + mod.forward(data_batch) + rois, scores, bbox_deltas = mod.get_outputs() + rois 
= rois[:, 1:] + scores = scores[0] + bbox_deltas = bbox_deltas[0] + im_info = im_info[0] + + # decode detection + det = im_detect(rois, scores, bbox_deltas, im_info, + bbox_stds=args.rcnn_bbox_stds, nms_thresh=args.rcnn_nms_thresh, + conf_thresh=args.rcnn_conf_thresh) + + # print out + for [cls, conf, x1, y1, x2, y2] in det: + if cls > 0 and conf > args.vis_thresh: + print(class_names[int(cls)], conf, [x1, y1, x2, y2]) + + # if vis + if args.vis: + vis_detection(im_orig, det, class_names, thresh=args.vis_thresh) def parse_args(): - parser = argparse.ArgumentParser(description='Demonstrate a Faster R-CNN network') - parser.add_argument('--image', help='custom image', type=str) - parser.add_argument('--prefix', help='saved model prefix', type=str) - parser.add_argument('--epoch', help='epoch of pretrained model', type=int) - parser.add_argument('--gpu', help='GPU device to use', default=0, type=int) - parser.add_argument('--vis', help='display result', action='store_true') + parser = argparse.ArgumentParser(description='Demonstrate a Faster R-CNN network', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--network', type=str, default='vgg16', help='base network') + parser.add_argument('--params', type=str, default='', help='path to trained model') + parser.add_argument('--dataset', type=str, default='voc', help='training dataset') + parser.add_argument('--image', type=str, default='', help='path to test image') + parser.add_argument('--gpu', type=str, default='', help='gpu device eg. 0') + parser.add_argument('--vis', action='store_true', help='display results') + parser.add_argument('--vis-thresh', type=float, default=0.7, help='threshold display boxes') + # faster rcnn params + parser.add_argument('--img-short-side', type=int, default=600) + parser.add_argument('--img-long-side', type=int, default=1000) + parser.add_argument('--img-pixel-means', type=str, default='(0.0, 0.0, 0.0)') + parser.add_argument('--img-pixel-stds', type=str, default='(1.0, 1.0, 1.0)') + parser.add_argument('--rpn-feat-stride', type=int, default=16) + parser.add_argument('--rpn-anchor-scales', type=str, default='(8, 16, 32)') + parser.add_argument('--rpn-anchor-ratios', type=str, default='(0.5, 1, 2)') + parser.add_argument('--rpn-pre-nms-topk', type=int, default=6000) + parser.add_argument('--rpn-post-nms-topk', type=int, default=300) + parser.add_argument('--rpn-nms-thresh', type=float, default=0.7) + parser.add_argument('--rpn-min-size', type=int, default=16) + parser.add_argument('--rcnn-num-classes', type=int, default=21) + parser.add_argument('--rcnn-feat-stride', type=int, default=16) + parser.add_argument('--rcnn-pooled-size', type=str, default='(14, 14)') + parser.add_argument('--rcnn-batch-size', type=int, default=1) + parser.add_argument('--rcnn-bbox-stds', type=str, default='(0.1, 0.1, 0.2, 0.2)') + parser.add_argument('--rcnn-nms-thresh', type=float, default=0.3) + parser.add_argument('--rcnn-conf-thresh', type=float, default=1e-3) args = parser.parse_args() + args.img_pixel_means = ast.literal_eval(args.img_pixel_means) + args.img_pixel_stds = ast.literal_eval(args.img_pixel_stds) + args.rpn_anchor_scales = ast.literal_eval(args.rpn_anchor_scales) + args.rpn_anchor_ratios = ast.literal_eval(args.rpn_anchor_ratios) + args.rcnn_pooled_size = ast.literal_eval(args.rcnn_pooled_size) + args.rcnn_bbox_stds = ast.literal_eval(args.rcnn_bbox_stds) return args +def get_voc_names(args): + from symimdb.pascal_voc import PascalVOC + args.rcnn_num_classes = len(PascalVOC.classes) + 
return PascalVOC.classes + + +def get_coco_names(args): + from symimdb.coco import coco + args.rcnn_num_classes = len(coco.classes) + return coco.classes + + +def get_vgg16_test(args): + from symnet.symbol_vgg import get_vgg_test + if not args.params: + args.params = 'model/vgg16-0010.params' + args.img_pixel_means = (123.68, 116.779, 103.939) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.net_fixed_params = ['conv1', 'conv2'] + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (7, 7) + return get_vgg_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size) + + +def get_resnet50_test(args): + from symnet.symbol_resnet import get_resnet_test + if not args.params: + args.params = 'model/resnet50-0010.params' + args.img_pixel_means = (0.0, 0.0, 0.0) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (14, 14) + return get_resnet_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size, + units=(3, 4, 6, 3), filter_list=(256, 512, 1024, 2048)) + + +def get_resnet101_test(args): + from symnet.symbol_resnet import get_resnet_test + if not args.params: + args.params = 'model/resnet101-0010.params' + args.img_pixel_means = (0.0, 0.0, 0.0) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (14, 14) + return get_resnet_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size, + units=(3, 4, 23, 3), filter_list=(256, 512, 1024, 2048)) + +def get_class_names(dataset, args): + datasets = { + 'voc': get_voc_names, + 'coco': get_coco_names + } + if dataset not in datasets: + raise ValueError("dataset {} not supported".format(dataset)) + return datasets[dataset](args) + + +def get_network(network, args): + networks = { + 'vgg16': get_vgg16_test, + 'resnet50': get_resnet50_test, + 'resnet101': get_resnet101_test + } + if network not in networks: + raise ValueError("network {} not supported".format(network)) + return networks[network](args) + + def main(): args = parse_args() - ctx = mx.gpu(args.gpu) - symbol = get_vgg_test(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS) - predictor = get_net(symbol, args.prefix, args.epoch, ctx) - demo_net(predictor, args.image, args.vis) + class_names = get_class_names(args.dataset, args) + sym = get_network(args.network, args) + demo_net(sym, class_names, args) if __name__ == '__main__': 
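To summarize the new inference path in `demo.py`: below is a minimal sketch of what `demo_net` does end to end, condensed from the diff above. The values shown are the `parse_args` defaults, and `load_test`, `generate_batch`, `load_param` and `im_detect` are the helpers this diff introduces; their signatures are inferred from the call sites here, not from separate documentation.

```python
# Minimal sketch of the new demo.py inference path (condensed from the diff
# above; parameter values are the parse_args defaults).
import mxnet as mx
from mxnet.module import Module

from symdata.bbox import im_detect
from symdata.loader import load_test, generate_batch
from symnet.model import load_param


def detect_single_image(sym, params_path, image_path, class_names, ctx=mx.cpu(0)):
    # preprocess one image: resize short side to 600, cap long side at 1000
    # (im_orig is only needed for visualization)
    im_tensor, im_info, im_orig = load_test(image_path, short=600, max_size=1000,
                                            mean=(0.0, 0.0, 0.0), std=(1.0, 1.0, 1.0))
    data_batch = generate_batch(im_tensor, im_info)

    # bind the module once at the largest possible input shape and load weights
    arg_params, aux_params = load_param(params_path, ctx=ctx)
    mod = Module(sym, ['data', 'im_info'], None, context=ctx)
    mod.bind([('data', (1, 3, 1000, 1000)), ('im_info', (1, 3))], None, for_training=False)
    mod.init_params(arg_params=arg_params, aux_params=aux_params)

    # forward pass; drop the batch-index column of rois before decoding
    mod.forward(data_batch)
    rois, scores, bbox_deltas = mod.get_outputs()
    det = im_detect(rois[:, 1:], scores[0], bbox_deltas[0], im_info[0],
                    bbox_stds=(0.1, 0.1, 0.2, 0.2), nms_thresh=0.3, conf_thresh=1e-3)

    # keep non-background detections above the display threshold
    return [(class_names[int(cls)], conf, (x1, y1, x2, y2))
            for cls, conf, x1, y1, x2, y2 in det if cls > 0 and conf > 0.7]
```

Binding once at the maximum image shape mirrors the `# produce shape max possible` step in `demo_net`, so a single bound executor can serve any test image within the configured size limits.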
diff --git a/example/rcnn/rcnn/config.py b/example/rcnn/rcnn/config.py deleted file mode 100644 index 17738f054b33..000000000000 --- a/example/rcnn/rcnn/config.py +++ /dev/null @@ -1,181 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -from easydict import EasyDict as edict - -config = edict() - -# network related params -config.PIXEL_MEANS = np.array([103.939, 116.779, 123.68]) -config.IMAGE_STRIDE = 0 -config.RPN_FEAT_STRIDE = 16 -config.RCNN_FEAT_STRIDE = 16 -config.FIXED_PARAMS = ['conv1', 'conv2'] -config.FIXED_PARAMS_SHARED = ['conv1', 'conv2', 'conv3', 'conv4', 'conv5'] - -# dataset related params -config.NUM_CLASSES = 21 -config.SCALES = [(600, 1000)] # first is scale (the shorter side); second is max size -config.ANCHOR_SCALES = (8, 16, 32) -config.ANCHOR_RATIOS = (0.5, 1, 2) -config.NUM_ANCHORS = len(config.ANCHOR_SCALES) * len(config.ANCHOR_RATIOS) - -config.TRAIN = edict() - -# R-CNN and RPN -# size of images for each device, 2 for rcnn, 1 for rpn and e2e -config.TRAIN.BATCH_IMAGES = 2 -# e2e changes behavior of anchor loader and metric -config.TRAIN.END2END = False -# group images with similar aspect ratio -config.TRAIN.ASPECT_GROUPING = True - -# R-CNN -# rcnn rois batch size -config.TRAIN.BATCH_ROIS = 128 -# rcnn rois sampling params -config.TRAIN.FG_FRACTION = 0.25 -config.TRAIN.FG_THRESH = 0.5 -config.TRAIN.BG_THRESH_HI = 0.5 -config.TRAIN.BG_THRESH_LO = 0.0 -# rcnn bounding box regression params -config.TRAIN.BBOX_REGRESSION_THRESH = 0.5 -config.TRAIN.BBOX_WEIGHTS = np.array([1.0, 1.0, 1.0, 1.0]) - -# RPN anchor loader -# rpn anchors batch size -config.TRAIN.RPN_BATCH_SIZE = 256 -# rpn anchors sampling params -config.TRAIN.RPN_FG_FRACTION = 0.5 -config.TRAIN.RPN_POSITIVE_OVERLAP = 0.7 -config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3 -config.TRAIN.RPN_CLOBBER_POSITIVES = False -# rpn bounding box regression params -config.TRAIN.RPN_BBOX_WEIGHTS = (1.0, 1.0, 1.0, 1.0) -config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0 - -# used for end2end training -# RPN proposal -config.TRAIN.CXX_PROPOSAL = True -config.TRAIN.RPN_NMS_THRESH = 0.7 -config.TRAIN.RPN_PRE_NMS_TOP_N = 12000 -config.TRAIN.RPN_POST_NMS_TOP_N = 2000 -config.TRAIN.RPN_MIN_SIZE = config.RPN_FEAT_STRIDE -# approximate bounding box regression -config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = False -config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0) -config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2) - -config.TEST = edict() - -# R-CNN testing -# use rpn to generate proposal -config.TEST.HAS_RPN = False -# size of images for each device -config.TEST.BATCH_IMAGES = 1 - -# RPN proposal -config.TEST.CXX_PROPOSAL = True -config.TEST.RPN_NMS_THRESH = 0.7 -config.TEST.RPN_PRE_NMS_TOP_N = 6000 -config.TEST.RPN_POST_NMS_TOP_N = 300 -config.TEST.RPN_MIN_SIZE = config.RPN_FEAT_STRIDE - -# RPN 
generate proposal -config.TEST.PROPOSAL_NMS_THRESH = 0.7 -config.TEST.PROPOSAL_PRE_NMS_TOP_N = 20000 -config.TEST.PROPOSAL_POST_NMS_TOP_N = 2000 -config.TEST.PROPOSAL_MIN_SIZE = config.RPN_FEAT_STRIDE - -# RCNN nms -config.TEST.NMS = 0.3 - -# default settings -default = edict() - -# default network -default.network = 'vgg' -default.pretrained = 'model/vgg16' -default.pretrained_epoch = 0 -default.base_lr = 0.001 -# default dataset -default.dataset = 'PascalVOC' -default.image_set = '2007_trainval' -default.test_image_set = '2007_test' -default.root_path = 'data' -default.dataset_path = 'data/VOCdevkit' -# default training -default.frequent = 20 -default.kvstore = 'device' -# default e2e -default.e2e_prefix = 'model/e2e' -default.e2e_epoch = 10 -default.e2e_lr = default.base_lr -default.e2e_lr_step = '7' -# default rpn -default.rpn_prefix = 'model/rpn' -default.rpn_epoch = 8 -default.rpn_lr = default.base_lr -default.rpn_lr_step = '6' -# default rcnn -default.rcnn_prefix = 'model/rcnn' -default.rcnn_epoch = 8 -default.rcnn_lr = default.base_lr -default.rcnn_lr_step = '6' - -# network settings -network = edict() - -network.vgg = edict() - -network.resnet = edict() -network.resnet.pretrained = 'model/resnet-101' -network.resnet.pretrained_epoch = 0 -network.resnet.PIXEL_MEANS = np.array([0, 0, 0]) -network.resnet.IMAGE_STRIDE = 0 -network.resnet.RPN_FEAT_STRIDE = 16 -network.resnet.RCNN_FEAT_STRIDE = 16 -network.resnet.FIXED_PARAMS = ['conv0', 'stage1', 'gamma', 'beta'] -network.resnet.FIXED_PARAMS_SHARED = ['conv0', 'stage1', 'stage2', 'stage3', 'gamma', 'beta'] - -# dataset settings -dataset = edict() - -dataset.PascalVOC = edict() - -dataset.coco = edict() -dataset.coco.dataset = 'coco' -dataset.coco.image_set = 'train2014' -dataset.coco.test_image_set = 'val2014' -dataset.coco.root_path = 'data' -dataset.coco.dataset_path = 'data/coco' -dataset.coco.NUM_CLASSES = 81 - - -def generate_config(_network, _dataset): - for k, v in network[_network].items(): - if k in config: - config[k] = v - elif k in default: - default[k] = v - for k, v in dataset[_dataset].items(): - if k in config: - config[k] = v - elif k in default: - default[k] = v - diff --git a/example/rcnn/rcnn/core/callback.py b/example/rcnn/rcnn/core/callback.py deleted file mode 100644 index 06eb2629e7bd..000000000000 --- a/example/rcnn/rcnn/core/callback.py +++ /dev/null @@ -1,28 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -import mxnet as mx - - -def do_checkpoint(prefix, means, stds): - def _callback(iter_no, sym, arg, aux): - arg['bbox_pred_weight_test'] = (arg['bbox_pred_weight'].T * mx.nd.array(stds)).T - arg['bbox_pred_bias_test'] = arg['bbox_pred_bias'] * mx.nd.array(stds) + mx.nd.array(means) - mx.model.save_checkpoint(prefix, iter_no + 1, sym, arg, aux) - arg.pop('bbox_pred_weight_test') - arg.pop('bbox_pred_bias_test') - return _callback diff --git a/example/rcnn/rcnn/core/loader.py b/example/rcnn/rcnn/core/loader.py deleted file mode 100644 index fdd6e5c386f1..000000000000 --- a/example/rcnn/rcnn/core/loader.py +++ /dev/null @@ -1,414 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -import numpy as np -from mxnet.executor_manager import _split_input_slice - -from rcnn.config import config -from rcnn.io.image import tensor_vstack -from rcnn.io.rpn import get_rpn_testbatch, get_rpn_batch, assign_anchor -from rcnn.io.rcnn import get_rcnn_testbatch, get_rcnn_batch - - -class TestLoader(mx.io.DataIter): - def __init__(self, roidb, batch_size=1, shuffle=False, - has_rpn=False): - super(TestLoader, self).__init__() - - # save parameters as properties - self.roidb = roidb - self.batch_size = batch_size - self.shuffle = shuffle - self.has_rpn = has_rpn - - # infer properties from roidb - self.size = len(self.roidb) - self.index = np.arange(self.size) - - # decide data and label names (only for training) - if has_rpn: - self.data_name = ['data', 'im_info'] - else: - self.data_name = ['data', 'rois'] - self.label_name = None - - # status variable for synchronization between get_data and get_label - self.cur = 0 - self.data = None - self.label = None - self.im_info = None - - # get first batch to fill in provide_data and provide_label - self.reset() - self.get_batch() - - @property - def provide_data(self): - return [(k, v.shape) for k, v in zip(self.data_name, self.data)] - - @property - def provide_label(self): - return None - - def reset(self): - self.cur = 0 - if self.shuffle: - np.random.shuffle(self.index) - - def iter_next(self): - return self.cur + self.batch_size <= self.size - - def next(self): - if self.iter_next(): - self.get_batch() - self.cur += self.batch_size - return self.im_info, \ - mx.io.DataBatch(data=self.data, label=self.label, - pad=self.getpad(), index=self.getindex(), - provide_data=self.provide_data, provide_label=self.provide_label) - else: - raise StopIteration - - def getindex(self): - return self.cur / self.batch_size - - def getpad(self): - if self.cur + self.batch_size > self.size: - return self.cur + self.batch_size - self.size - else: - return 0 - - def get_batch(self): - cur_from = self.cur - cur_to = min(cur_from + self.batch_size, self.size) - roidb = [self.roidb[self.index[i]] for i in range(cur_from, 
cur_to)] - if self.has_rpn: - data, label, im_info = get_rpn_testbatch(roidb) - else: - data, label, im_info = get_rcnn_testbatch(roidb) - self.data = [mx.nd.array(data[name]) for name in self.data_name] - self.im_info = im_info - - -class ROIIter(mx.io.DataIter): - def __init__(self, roidb, batch_size=2, shuffle=False, ctx=None, work_load_list=None, aspect_grouping=False): - """ - This Iter will provide roi data to Fast R-CNN network - :param roidb: must be preprocessed - :param batch_size: must divide BATCH_SIZE(128) - :param shuffle: bool - :param ctx: list of contexts - :param work_load_list: list of work load - :param aspect_grouping: group images with similar aspects - :return: ROIIter - """ - super(ROIIter, self).__init__() - - # save parameters as properties - self.roidb = roidb - self.batch_size = batch_size - self.shuffle = shuffle - self.ctx = ctx - if self.ctx is None: - self.ctx = [mx.cpu()] - self.work_load_list = work_load_list - self.aspect_grouping = aspect_grouping - - # infer properties from roidb - self.size = len(roidb) - self.index = np.arange(self.size) - - # decide data and label names (only for training) - self.data_name = ['data', 'rois'] - self.label_name = ['label', 'bbox_target', 'bbox_weight'] - - # status variable for synchronization between get_data and get_label - self.cur = 0 - self.batch = None - self.data = None - self.label = None - - # get first batch to fill in provide_data and provide_label - self.reset() - self.get_batch() - - @property - def provide_data(self): - return [(k, v.shape) for k, v in zip(self.data_name, self.data)] - - @property - def provide_label(self): - return [(k, v.shape) for k, v in zip(self.label_name, self.label)] - - def reset(self): - self.cur = 0 - if self.shuffle: - if self.aspect_grouping: - widths = np.array([r['width'] for r in self.roidb]) - heights = np.array([r['height'] for r in self.roidb]) - horz = (widths >= heights) - vert = np.logical_not(horz) - horz_inds = np.where(horz)[0] - vert_inds = np.where(vert)[0] - # Avoid putting different aspect ratio image into the same bucket, - # which may cause bucketing warning. 
- pad_horz = self.batch_size - len(horz_inds) % self.batch_size - pad_vert = self.batch_size - len(vert_inds) % self.batch_size - horz_inds = np.hstack([horz_inds, horz_inds[:pad_horz]]) - vert_inds = np.hstack([vert_inds, vert_inds[:pad_vert]]) - inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) - inds = np.reshape(inds[:], (-1, self.batch_size)) - row_perm = np.random.permutation(np.arange(inds.shape[0])) - inds = np.reshape(inds[row_perm, :], (-1,)) - self.index = inds - else: - np.random.shuffle(self.index) - - def iter_next(self): - return self.cur + self.batch_size <= self.size - - def next(self): - if self.iter_next(): - self.get_batch() - self.cur += self.batch_size - return mx.io.DataBatch(data=self.data, label=self.label, - pad=self.getpad(), index=self.getindex(), - provide_data=self.provide_data, provide_label=self.provide_label) - else: - raise StopIteration - - def getindex(self): - return self.cur / self.batch_size - - def getpad(self): - if self.cur + self.batch_size > self.size: - return self.cur + self.batch_size - self.size - else: - return 0 - - def get_batch(self): - # slice roidb - cur_from = self.cur - cur_to = min(cur_from + self.batch_size, self.size) - roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] - - # decide multi device slices - work_load_list = self.work_load_list - ctx = self.ctx - if work_load_list is None: - work_load_list = [1] * len(ctx) - assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ - "Invalid settings for work load. " - slices = _split_input_slice(self.batch_size, work_load_list) - - # get each device - data_list = [] - label_list = [] - for islice in slices: - iroidb = [roidb[i] for i in range(islice.start, islice.stop)] - data, label = get_rcnn_batch(iroidb) - data_list.append(data) - label_list.append(label) - - all_data = dict() - for key in data_list[0].keys(): - all_data[key] = tensor_vstack([batch[key] for batch in data_list]) - - all_label = dict() - for key in label_list[0].keys(): - all_label[key] = tensor_vstack([batch[key] for batch in label_list]) - - self.data = [mx.nd.array(all_data[name]) for name in self.data_name] - self.label = [mx.nd.array(all_label[name]) for name in self.label_name] - - -class AnchorLoader(mx.io.DataIter): - def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, ctx=None, work_load_list=None, - feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2), allowed_border=0, - aspect_grouping=False): - """ - This Iter will provide roi data to Fast R-CNN network - :param feat_sym: to infer shape of assign_output - :param roidb: must be preprocessed - :param batch_size: must divide BATCH_SIZE(128) - :param shuffle: bool - :param ctx: list of contexts - :param work_load_list: list of work load - :param aspect_grouping: group images with similar aspects - :return: AnchorLoader - """ - super(AnchorLoader, self).__init__() - - # save parameters as properties - self.feat_sym = feat_sym - self.roidb = roidb - self.batch_size = batch_size - self.shuffle = shuffle - self.ctx = ctx - if self.ctx is None: - self.ctx = [mx.cpu()] - self.work_load_list = work_load_list - self.feat_stride = feat_stride - self.anchor_scales = anchor_scales - self.anchor_ratios = anchor_ratios - self.allowed_border = allowed_border - self.aspect_grouping = aspect_grouping - - # infer properties from roidb - self.size = len(roidb) - self.index = np.arange(self.size) - - # decide data and label names - if config.TRAIN.END2END: - 
self.data_name = ['data', 'im_info', 'gt_boxes'] - else: - self.data_name = ['data'] - self.label_name = ['label', 'bbox_target', 'bbox_weight'] - - # status variable for synchronization between get_data and get_label - self.cur = 0 - self.batch = None - self.data = None - self.label = None - - # get first batch to fill in provide_data and provide_label - self.reset() - self.get_batch() - - @property - def provide_data(self): - return [(k, v.shape) for k, v in zip(self.data_name, self.data)] - - @property - def provide_label(self): - return [(k, v.shape) for k, v in zip(self.label_name, self.label)] - - def reset(self): - self.cur = 0 - if self.shuffle: - if self.aspect_grouping: - widths = np.array([r['width'] for r in self.roidb]) - heights = np.array([r['height'] for r in self.roidb]) - horz = (widths >= heights) - vert = np.logical_not(horz) - horz_inds = np.where(horz)[0] - vert_inds = np.where(vert)[0] - inds = np.hstack((np.random.permutation(horz_inds), np.random.permutation(vert_inds))) - extra = inds.shape[0] % self.batch_size - inds_ = np.reshape(inds[:-extra], (-1, self.batch_size)) - row_perm = np.random.permutation(np.arange(inds_.shape[0])) - inds[:-extra] = np.reshape(inds_[row_perm, :], (-1,)) - self.index = inds - else: - np.random.shuffle(self.index) - - def iter_next(self): - return self.cur + self.batch_size <= self.size - - def next(self): - if self.iter_next(): - self.get_batch() - self.cur += self.batch_size - return mx.io.DataBatch(data=self.data, label=self.label, - pad=self.getpad(), index=self.getindex(), - provide_data=self.provide_data, provide_label=self.provide_label) - else: - raise StopIteration - - def getindex(self): - return self.cur / self.batch_size - - def getpad(self): - if self.cur + self.batch_size > self.size: - return self.cur + self.batch_size - self.size - else: - return 0 - - def infer_shape(self, max_data_shape=None, max_label_shape=None): - """ Return maximum data and label shape for single gpu """ - if max_data_shape is None: - max_data_shape = [] - if max_label_shape is None: - max_label_shape = [] - max_shapes = dict(max_data_shape + max_label_shape) - input_batch_size = max_shapes['data'][0] - im_info = [[max_shapes['data'][2], max_shapes['data'][3], 1.0]] - _, feat_shape, _ = self.feat_sym.infer_shape(**max_shapes) - label = assign_anchor(feat_shape[0], np.zeros((0, 5)), im_info, - self.feat_stride, self.anchor_scales, self.anchor_ratios, self.allowed_border) - label = [label[k] for k in self.label_name] - label_shape = [(k, tuple([input_batch_size] + list(v.shape[1:]))) for k, v in zip(self.label_name, label)] - return max_data_shape, label_shape - - def get_batch(self): - # slice roidb - cur_from = self.cur - cur_to = min(cur_from + self.batch_size, self.size) - roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)] - - # decide multi device slice - work_load_list = self.work_load_list - ctx = self.ctx - if work_load_list is None: - work_load_list = [1] * len(ctx) - assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \ - "Invalid settings for work load. 
" - slices = _split_input_slice(self.batch_size, work_load_list) - - # get testing data for multigpu - data_list = [] - label_list = [] - for islice in slices: - iroidb = [roidb[i] for i in range(islice.start, islice.stop)] - data, label = get_rpn_batch(iroidb) - data_list.append(data) - label_list.append(label) - - # pad data first and then assign anchor (read label) - data_tensor = tensor_vstack([batch['data'] for batch in data_list]) - for data, data_pad in zip(data_list, data_tensor): - data['data'] = data_pad[np.newaxis, :] - - new_label_list = [] - for data, label in zip(data_list, label_list): - # infer label shape - data_shape = {k: v.shape for k, v in data.items()} - del data_shape['im_info'] - _, feat_shape, _ = self.feat_sym.infer_shape(**data_shape) - feat_shape = [int(i) for i in feat_shape[0]] - - # add gt_boxes to data for e2e - data['gt_boxes'] = label['gt_boxes'][np.newaxis, :, :] - - # assign anchor for label - label = assign_anchor(feat_shape, label['gt_boxes'], data['im_info'], - self.feat_stride, self.anchor_scales, - self.anchor_ratios, self.allowed_border) - new_label_list.append(label) - - all_data = dict() - for key in self.data_name: - all_data[key] = tensor_vstack([batch[key] for batch in data_list]) - - all_label = dict() - for key in self.label_name: - pad = -1 if key == 'label' else 0 - all_label[key] = tensor_vstack([batch[key] for batch in new_label_list], pad=pad) - - self.data = [mx.nd.array(all_data[key]) for key in self.data_name] - self.label = [mx.nd.array(all_label[key]) for key in self.label_name] diff --git a/example/rcnn/rcnn/core/module.py b/example/rcnn/rcnn/core/module.py deleted file mode 100644 index 337f0f35852b..000000000000 --- a/example/rcnn/rcnn/core/module.py +++ /dev/null @@ -1,232 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""A `MutableModule` implement the `BaseModule` API, and allows input shape -varying with training iterations. If shapes vary, executors will rebind, -using shared arrays from the initial module binded with maximum shape. -""" - -import logging - -from mxnet import context as ctx -from mxnet.initializer import Uniform -from mxnet.module.base_module import BaseModule -from mxnet.module.module import Module - -class MutableModule(BaseModule): - """A mutable module is a module that supports variable input data. 
- - Parameters - ---------- - symbol : Symbol - data_names : list of str - label_names : list of str - logger : Logger - context : Context or list of Context - work_load_list : list of number - max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary - max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary - fixed_param_prefix : list of str, indicating fixed parameters - """ - def __init__(self, symbol, data_names, label_names, - logger=logging, context=ctx.cpu(), work_load_list=None, - max_data_shapes=None, max_label_shapes=None, fixed_param_prefix=None): - super(MutableModule, self).__init__(logger=logger) - self._symbol = symbol - self._data_names = data_names - self._label_names = label_names - self._context = context - self._work_load_list = work_load_list - - self._curr_module = None - self._max_data_shapes = max_data_shapes - self._max_label_shapes = max_label_shapes - self._fixed_param_prefix = fixed_param_prefix - - fixed_param_names = list() - if fixed_param_prefix is not None: - for name in self._symbol.list_arguments(): - for prefix in self._fixed_param_prefix: - if prefix in name: - fixed_param_names.append(name) - self._fixed_param_names = fixed_param_names - - def _reset_bind(self): - self.binded = False - self._curr_module = None - - @property - def data_names(self): - return self._data_names - - @property - def output_names(self): - return self._symbol.list_outputs() - - @property - def data_shapes(self): - assert self.binded - return self._curr_module.data_shapes - - @property - def label_shapes(self): - assert self.binded - return self._curr_module.label_shapes - - @property - def output_shapes(self): - assert self.binded - return self._curr_module.output_shapes - - def get_params(self): - assert self.binded and self.params_initialized - return self._curr_module.get_params() - - def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None, - allow_missing=False, force_init=False, allow_extra=False): - if self.params_initialized and not force_init: - return - assert self.binded, 'call bind before initializing the parameters' - self._curr_module.init_params(initializer=initializer, arg_params=arg_params, - aux_params=aux_params, allow_missing=allow_missing, - force_init=force_init, allow_extra=allow_extra) - self.params_initialized = True - - def bind(self, data_shapes, label_shapes=None, for_training=True, - inputs_need_grad=False, force_rebind=False, shared_module=None): - # in case we already initialized params, keep it - if self.params_initialized: - arg_params, aux_params = self.get_params() - - # force rebinding is typically used when one want to switch from - # training to prediction phase. 
- if force_rebind: - self._reset_bind() - - if self.binded: - self.logger.warning('Already binded, ignoring bind()') - return - - assert shared_module is None, 'shared_module for MutableModule is not supported' - - self.for_training = for_training - self.inputs_need_grad = inputs_need_grad - self.binded = True - - max_shapes_dict = dict() - if self._max_data_shapes is not None: - max_shapes_dict.update(dict(self._max_data_shapes)) - if self._max_label_shapes is not None: - max_shapes_dict.update(dict(self._max_label_shapes)) - - max_data_shapes = list() - for name, shape in data_shapes: - if name in max_shapes_dict: - max_data_shapes.append((name, max_shapes_dict[name])) - else: - max_data_shapes.append((name, shape)) - - max_label_shapes = list() - if label_shapes is not None: - for name, shape in label_shapes: - if name in max_shapes_dict: - max_label_shapes.append((name, max_shapes_dict[name])) - else: - max_label_shapes.append((name, shape)) - - if len(max_label_shapes) == 0: - max_label_shapes = None - - module = Module(self._symbol, self._data_names, self._label_names, logger=self.logger, - context=self._context, work_load_list=self._work_load_list, - fixed_param_names=self._fixed_param_names) - module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad, - force_rebind=False, shared_module=None) - self._curr_module = module - - # copy back saved params, if already initialized - if self.params_initialized: - self.set_params(arg_params, aux_params) - - def init_optimizer(self, kvstore='local', optimizer='sgd', - optimizer_params=(('learning_rate', 0.01),), force_init=False): - assert self.binded and self.params_initialized - if self.optimizer_initialized and not force_init: - self.logger.warning('optimizer already initialized, ignoring.') - return - - self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params, - force_init=force_init) - self.optimizer_initialized = True - - def forward(self, data_batch, is_train=None): - assert self.binded and self.params_initialized - - # get current_shapes - if self._curr_module.label_shapes is not None: - current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes) - else: - current_shapes = dict(self._curr_module.data_shapes) - - # get input_shapes - if data_batch.provide_label is not None: - input_shapes = dict(data_batch.provide_data + data_batch.provide_label) - else: - input_shapes = dict(data_batch.provide_data) - - # decide if shape changed - shape_changed = False - for k, v in current_shapes.items(): - if v != input_shapes[k]: - shape_changed = True - - if shape_changed: - module = Module(self._symbol, self._data_names, self._label_names, - logger=self.logger, context=self._context, - work_load_list=self._work_load_list, - fixed_param_names=self._fixed_param_names) - module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training, - self._curr_module.inputs_need_grad, force_rebind=False, - shared_module=self._curr_module) - self._curr_module = module - - self._curr_module.forward(data_batch, is_train=is_train) - - def backward(self, out_grads=None): - assert self.binded and self.params_initialized - self._curr_module.backward(out_grads=out_grads) - - def update(self): - assert self.binded and self.params_initialized and self.optimizer_initialized - self._curr_module.update() - - def get_outputs(self, merge_multi_context=True): - assert self.binded and self.params_initialized - return self._curr_module.get_outputs(merge_multi_context=merge_multi_context) - - 
def get_input_grads(self, merge_multi_context=True): - assert self.binded and self.params_initialized and self.inputs_need_grad - return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context) - - def update_metric(self, eval_metric, labels): - assert self.binded and self.params_initialized - self._curr_module.update_metric(eval_metric, labels) - - def install_monitor(self, mon): - """ Install monitor on all executors """ - assert self.binded - self._curr_module.install_monitor(mon) diff --git a/example/rcnn/rcnn/core/tester.py b/example/rcnn/rcnn/core/tester.py deleted file mode 100644 index a451883f5885..000000000000 --- a/example/rcnn/rcnn/core/tester.py +++ /dev/null @@ -1,282 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -try: - import cPickle as pickle -except ImportError: - import pickle -import os -import time -import mxnet as mx -import numpy as np -from builtins import range - -from .module import MutableModule -from rcnn.logger import logger -from rcnn.config import config -from rcnn.io import image -from rcnn.processing.bbox_transform import bbox_pred, clip_boxes -from rcnn.processing.nms import py_nms_wrapper, cpu_nms_wrapper, gpu_nms_wrapper - - -class Predictor(object): - def __init__(self, symbol, data_names, label_names, - context=mx.cpu(), max_data_shapes=None, - provide_data=None, provide_label=None, - arg_params=None, aux_params=None): - self._mod = MutableModule(symbol, data_names, label_names, - context=context, max_data_shapes=max_data_shapes) - self._mod.bind(provide_data, provide_label, for_training=False) - self._mod.init_params(arg_params=arg_params, aux_params=aux_params) - - def predict(self, data_batch): - self._mod.forward(data_batch) - return dict(zip(self._mod.output_names, self._mod.get_outputs())) - - -def im_proposal(predictor, data_batch, data_names, scale): - data_dict = dict(zip(data_names, data_batch.data)) - output = predictor.predict(data_batch) - - # drop the batch index - boxes = output['rois_output'].asnumpy()[:, 1:] - scores = output['rois_score'].asnumpy() - - # transform to original scale - boxes = boxes / scale - - return scores, boxes, data_dict - - -def generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.): - """ - Generate detections results using RPN. 
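For orientation, the Predictor defined above is typically driven as sketched here; sym, test_data, arg_params and aux_params are hypothetical stand-ins for a loaded symbol, a non-shuffled test iterator and a loaded checkpoint:

    predictor = Predictor(sym, data_names=['data', 'im_info'], label_names=[],
                          context=mx.gpu(0),
                          provide_data=test_data.provide_data,
                          provide_label=test_data.provide_label,
                          arg_params=arg_params, aux_params=aux_params)
    for im_info, data_batch in test_data:
        output = predictor.predict(data_batch)         # dict: output name -> NDArray
        rois = output['rois_output'].asnumpy()[:, 1:]  # drop the batch index column
        rois /= im_info[0, 2]                          # undo the test-time resize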
- :param predictor: Predictor - :param test_data: data iterator, must be non-shuffled - :param imdb: image database - :param vis: controls visualization - :param thresh: thresh for valid detections - :return: list of detected boxes - """ - assert vis or not test_data.shuffle - data_names = [k[0] for k in test_data.provide_data] - - i = 0 - t = time.time() - imdb_boxes = list() - original_boxes = list() - for im_info, data_batch in test_data: - t1 = time.time() - t - t = time.time() - - scale = im_info[0, 2] - scores, boxes, data_dict = im_proposal(predictor, data_batch, data_names, scale) - t2 = time.time() - t - t = time.time() - - # assemble proposals - dets = np.hstack((boxes, scores)) - original_boxes.append(dets) - - # filter proposals - keep = np.where(dets[:, 4:] > thresh)[0] - dets = dets[keep, :] - imdb_boxes.append(dets) - - if vis: - vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], scale) - - logger.info('generating %d/%d ' % (i + 1, imdb.num_images) + - 'proposal %d ' % (dets.shape[0]) + - 'data %.4fs net %.4fs' % (t1, t2)) - i += 1 - - assert len(imdb_boxes) == imdb.num_images, 'calculations not complete' - - # save results - rpn_folder = os.path.join(imdb.root_path, 'rpn_data') - if not os.path.exists(rpn_folder): - os.mkdir(rpn_folder) - - rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl') - with open(rpn_file, 'wb') as f: - pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL) - - if thresh > 0: - full_rpn_file = os.path.join(rpn_folder, imdb.name + '_full_rpn.pkl') - with open(full_rpn_file, 'wb') as f: - pickle.dump(original_boxes, f, pickle.HIGHEST_PROTOCOL) - - logger.info('wrote rpn proposals to %s' % rpn_file) - return imdb_boxes - - -def im_detect(predictor, data_batch, data_names, scale): - output = predictor.predict(data_batch) - - data_dict = dict(zip(data_names, data_batch.data)) - if config.TEST.HAS_RPN: - rois = output['rois_output'].asnumpy()[:, 1:] - else: - rois = data_dict['rois'].asnumpy().reshape((-1, 5))[:, 1:] - im_shape = data_dict['data'].shape - - # save output - scores = output['cls_prob_reshape_output'].asnumpy()[0] - bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] - - # post processing - pred_boxes = bbox_pred(rois, bbox_deltas) - pred_boxes = clip_boxes(pred_boxes, im_shape[-2:]) - - # we used scaled image & roi to train, so it is necessary to transform them back - pred_boxes = pred_boxes / scale - - return scores, pred_boxes, data_dict - - -def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3): - """ - wrapper for calculating offline validation for faster data analysis - in this example, all threshold are set by hand - :param predictor: Predictor - :param test_data: data iterator, must be non-shuffle - :param imdb: image database - :param vis: controls visualization - :param thresh: valid detection threshold - :return: - """ - assert vis or not test_data.shuffle - data_names = [k[0] for k in test_data.provide_data] - - nms = py_nms_wrapper(config.TEST.NMS) - - # limit detections to max_per_image over all classes - max_per_image = -1 - - num_images = imdb.num_images - # all detections are collected into: - # all_boxes[cls][image] = N x 5 array of detections in - # (x1, y1, x2, y2, score) - all_boxes = [[[] for _ in range(num_images)] - for _ in range(imdb.num_classes)] - - i = 0 - t = time.time() - for im_info, data_batch in test_data: - t1 = time.time() - t - t = time.time() - - scale = im_info[0, 2] - scores, boxes, data_dict = im_detect(predictor, data_batch, data_names, scale) - - t2 = 
time.time() - t - t = time.time() - - for j in range(1, imdb.num_classes): - indexes = np.where(scores[:, j] > thresh)[0] - cls_scores = scores[indexes, j, np.newaxis] - cls_boxes = boxes[indexes, j * 4:(j + 1) * 4] - cls_dets = np.hstack((cls_boxes, cls_scores)) - keep = nms(cls_dets) - all_boxes[j][i] = cls_dets[keep, :] - - if max_per_image > 0: - image_scores = np.hstack([all_boxes[j][i][:, -1] - for j in range(1, imdb.num_classes)]) - if len(image_scores) > max_per_image: - image_thresh = np.sort(image_scores)[-max_per_image] - for j in range(1, imdb.num_classes): - keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] - all_boxes[j][i] = all_boxes[j][i][keep, :] - - if vis: - boxes_this_image = [[]] + [all_boxes[j][i] for j in range(1, imdb.num_classes)] - vis_all_detection(data_dict['data'].asnumpy(), boxes_this_image, imdb.classes, scale) - - t3 = time.time() - t - t = time.time() - logger.info('testing %d/%d data %.4fs net %.4fs post %.4fs' % (i, imdb.num_images, t1, t2, t3)) - i += 1 - - det_file = os.path.join(imdb.cache_path, imdb.name + '_detections.pkl') - with open(det_file, 'wb') as f: - pickle.dump(all_boxes, f, protocol=pickle.HIGHEST_PROTOCOL) - - imdb.evaluate_detections(all_boxes) - - -def vis_all_detection(im_array, detections, class_names, scale): - """ - visualize all detections in one image - :param im_array: [b=1 c h w] in rgb - :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] - :param class_names: list of names in imdb - :param scale: visualize the scaled image - :return: - """ - import matplotlib.pyplot as plt - import random - im = image.transform_inverse(im_array, config.PIXEL_MEANS) - plt.imshow(im) - for j, name in enumerate(class_names): - if name == '__background__': - continue - color = (random.random(), random.random(), random.random()) # generate a random color - dets = detections[j] - for det in dets: - bbox = det[:4] * scale - score = det[-1] - rect = plt.Rectangle((bbox[0], bbox[1]), - bbox[2] - bbox[0], - bbox[3] - bbox[1], fill=False, - edgecolor=color, linewidth=3.5) - plt.gca().add_patch(rect) - plt.gca().text(bbox[0], bbox[1] - 2, - '{:s} {:.3f}'.format(name, score), - bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white') - plt.show() - - -def draw_all_detection(im_array, detections, class_names, scale): - """ - visualize all detections in one image - :param im_array: [b=1 c h w] in rgb - :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] - :param class_names: list of names in imdb - :param scale: visualize the scaled image - :return: - """ - import cv2 - import random - color_white = (255, 255, 255) - im = image.transform_inverse(im_array, config.PIXEL_MEANS) - # change to bgr - im = cv2.cvtColor(im, cv2.cv.CV_RGB2BGR) - for j, name in enumerate(class_names): - if name == '__background__': - continue - color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) # generate a random color - dets = detections[j] - for det in dets: - bbox = det[:4] * scale - score = det[-1] - bbox = map(int, bbox) - cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=2) - cv2.putText(im, '%s %.3f' % (class_names[j], score), (bbox[0], bbox[1] + 10), - color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5) - return im diff --git a/example/rcnn/rcnn/cython/.gitignore b/example/rcnn/rcnn/cython/.gitignore deleted file mode 100644 index 15a165d42716..000000000000 --- a/example/rcnn/rcnn/cython/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ 
-*.c -*.cpp -*.so diff --git a/example/rcnn/rcnn/cython/bbox.pyx b/example/rcnn/rcnn/cython/bbox.pyx deleted file mode 100644 index 0c49e120e5ab..000000000000 --- a/example/rcnn/rcnn/cython/bbox.pyx +++ /dev/null @@ -1,55 +0,0 @@ -# -------------------------------------------------------- -# Fast R-CNN -# Copyright (c) 2015 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Sergey Karayev -# -------------------------------------------------------- - -cimport cython -import numpy as np -cimport numpy as np - -DTYPE = np.float -ctypedef np.float_t DTYPE_t - -def bbox_overlaps_cython( - np.ndarray[DTYPE_t, ndim=2] boxes, - np.ndarray[DTYPE_t, ndim=2] query_boxes): - """ - Parameters - ---------- - boxes: (N, 4) ndarray of float - query_boxes: (K, 4) ndarray of float - Returns - ------- - overlaps: (N, K) ndarray of overlap between boxes and query_boxes - """ - cdef unsigned int N = boxes.shape[0] - cdef unsigned int K = query_boxes.shape[0] - cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) - cdef DTYPE_t iw, ih, box_area - cdef DTYPE_t ua - cdef unsigned int k, n - for k in range(K): - box_area = ( - (query_boxes[k, 2] - query_boxes[k, 0] + 1) * - (query_boxes[k, 3] - query_boxes[k, 1] + 1) - ) - for n in range(N): - iw = ( - min(boxes[n, 2], query_boxes[k, 2]) - - max(boxes[n, 0], query_boxes[k, 0]) + 1 - ) - if iw > 0: - ih = ( - min(boxes[n, 3], query_boxes[k, 3]) - - max(boxes[n, 1], query_boxes[k, 1]) + 1 - ) - if ih > 0: - ua = float( - (boxes[n, 2] - boxes[n, 0] + 1) * - (boxes[n, 3] - boxes[n, 1] + 1) + - box_area - iw * ih - ) - overlaps[n, k] = iw * ih / ua - return overlaps diff --git a/example/rcnn/rcnn/cython/cpu_nms.pyx b/example/rcnn/rcnn/cython/cpu_nms.pyx deleted file mode 100644 index 1d0bef3321d7..000000000000 --- a/example/rcnn/rcnn/cython/cpu_nms.pyx +++ /dev/null @@ -1,68 +0,0 @@ -# -------------------------------------------------------- -# Fast R-CNN -# Copyright (c) 2015 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ross Girshick -# -------------------------------------------------------- - -import numpy as np -cimport numpy as np - -cdef inline np.float32_t max(np.float32_t a, np.float32_t b): - return a if a >= b else b - -cdef inline np.float32_t min(np.float32_t a, np.float32_t b): - return a if a <= b else b - -def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): - cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] - cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] - cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] - cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] - cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] - - cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) - cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] - - cdef int ndets = dets.shape[0] - cdef np.ndarray[np.int_t, ndim=1] suppressed = \ - np.zeros((ndets), dtype=np.int) - - # nominal indices - cdef int _i, _j - # sorted indices - cdef int i, j - # temp variables for box i's (the box currently under consideration) - cdef np.float32_t ix1, iy1, ix2, iy2, iarea - # variables for computing overlap with box j (lower scoring box) - cdef np.float32_t xx1, yy1, xx2, yy2 - cdef np.float32_t w, h - cdef np.float32_t inter, ovr - - keep = [] - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - keep.append(i) - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i 
+ 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - ovr = inter / (iarea + areas[j] - inter) - if ovr >= thresh: - suppressed[j] = 1 - - return keep diff --git a/example/rcnn/rcnn/cython/gpu_nms.hpp b/example/rcnn/rcnn/cython/gpu_nms.hpp deleted file mode 100644 index 93d1f90183bb..000000000000 --- a/example/rcnn/rcnn/cython/gpu_nms.hpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.  See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership.  The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.  You may obtain a copy of the License at - * - *   http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied.  See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, - int boxes_dim, float nms_overlap_thresh, int device_id); diff --git a/example/rcnn/rcnn/cython/gpu_nms.pyx b/example/rcnn/rcnn/cython/gpu_nms.pyx deleted file mode 100644 index 59d84afe94e4..000000000000 --- a/example/rcnn/rcnn/cython/gpu_nms.pyx +++ /dev/null @@ -1,31 +0,0 @@ -# -------------------------------------------------------- -# Faster R-CNN -# Copyright (c) 2015 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ross Girshick -# -------------------------------------------------------- - -import numpy as np -cimport numpy as np - -assert sizeof(int) == sizeof(np.int32_t) - -cdef extern from "gpu_nms.hpp": - void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) - -def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, - np.int32_t device_id=0): - cdef int boxes_num = dets.shape[0] - cdef int boxes_dim = dets.shape[1] - cdef int num_out - cdef np.ndarray[np.int32_t, ndim=1] \ - keep = np.zeros(boxes_num, dtype=np.int32) - cdef np.ndarray[np.float32_t, ndim=1] \ - scores = dets[:, 4] - cdef np.ndarray[np.int_t, ndim=1] \ - order = scores.argsort()[::-1] - cdef np.ndarray[np.float32_t, ndim=2] \ - sorted_dets = dets[order, :] - _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) - keep = keep[:num_out] - return list(order[keep]) diff --git a/example/rcnn/rcnn/cython/nms_kernel.cu b/example/rcnn/rcnn/cython/nms_kernel.cu deleted file mode 100644 index 038a59012f60..000000000000 --- a/example/rcnn/rcnn/cython/nms_kernel.cu +++ /dev/null @@ -1,144 +0,0 @@ -// ------------------------------------------------------------------ -// Faster R-CNN -// Copyright (c) 2015 Microsoft -// Licensed under The MIT License [see fast-rcnn/LICENSE for details] -// Written by Shaoqing Ren -// ------------------------------------------------------------------ - -#include "gpu_nms.hpp" -#include <vector> -#include <iostream> - -#define CUDA_CHECK(condition) \ - /* Code block avoids redefinition of cudaError_t error */ \ - do { \ - cudaError_t error = condition; \ - if (error != cudaSuccess) { \ -
std::cout << cudaGetErrorString(error) << std::endl; \ - } \ - } while (0) - -#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) -int const threadsPerBlock = sizeof(unsigned long long) * 8; - -__device__ inline float devIoU(float const * const a, float const * const b) { - float left = max(a[0], b[0]), right = min(a[2], b[2]); - float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); - float interS = width * height; - float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); - float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); - return interS / (Sa + Sb - interS); -} - -__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, - const float *dev_boxes, unsigned long long *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - // if (row_start > col_start) return; - - const int row_size = - min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); - const int col_size = - min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); - - __shared__ float block_boxes[threadsPerBlock * 5]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 5 + 0] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; - block_boxes[threadIdx.x * 5 + 1] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; - block_boxes[threadIdx.x * 5 + 2] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; - block_boxes[threadIdx.x * 5 + 3] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; - block_boxes[threadIdx.x * 5 + 4] = - dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; - const float *cur_box = dev_boxes + cur_box_idx * 5; - int i = 0; - unsigned long long t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { - t |= 1ULL << i; - } - } - const int col_blocks = DIVUP(n_boxes, threadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } -} - -void _set_device(int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) { - return; - } - // The call to cudaSetDevice must come before any calls to Get, which - // may perform initialization using the GPU. 
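The kernel above never sorts or suppresses anything itself: boxes arrive pre-sorted by score, the grid is tiled into 64-box row/column blocks, and each thread fills one 64-bit word of dev_mask whose bit j means "my box overlaps column box j above the threshold". The sequential scan in _nms below then walks boxes in score order and ORs those words together. A rough Python model of the mask-building step, with hypothetical helper names (the real work happens per-block in shared memory):

    import numpy as np

    def iou(a, b):
        # same +1 pixel convention as devIoU above
        w = max(0.0, min(a[2], b[2]) - max(a[0], b[0]) + 1)
        h = max(0.0, min(a[3], b[3]) - max(a[1], b[1]) + 1)
        inter = w * h
        area = lambda r: (r[2] - r[0] + 1) * (r[3] - r[1] + 1)
        return inter / (area(a) + area(b) - inter)

    def build_masks(boxes, thresh, block=64):
        # boxes: (n, 4) array already sorted by descending score
        n = len(boxes)
        col_blocks = (n + block - 1) // block     # DIVUP(n, threadsPerBlock)
        mask = np.zeros((n, col_blocks), dtype=np.uint64)
        for i in range(n):
            for j in range(i + 1, n):             # only lower-scoring boxes
                if iou(boxes[i], boxes[j]) > thresh:
                    mask[i, j // block] |= np.uint64(1) << np.uint64(j % block)
        return mask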
- CUDA_CHECK(cudaSetDevice(device_id)); -} - -void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, - int boxes_dim, float nms_overlap_thresh, int device_id) { - _set_device(device_id); - - float* boxes_dev = NULL; - unsigned long long* mask_dev = NULL; - - const int col_blocks = DIVUP(boxes_num, threadsPerBlock); - - CUDA_CHECK(cudaMalloc(&boxes_dev, - boxes_num * boxes_dim * sizeof(float))); - CUDA_CHECK(cudaMemcpy(boxes_dev, - boxes_host, - boxes_num * boxes_dim * sizeof(float), - cudaMemcpyHostToDevice)); - - CUDA_CHECK(cudaMalloc(&mask_dev, - boxes_num * col_blocks * sizeof(unsigned long long))); - - dim3 blocks(DIVUP(boxes_num, threadsPerBlock), - DIVUP(boxes_num, threadsPerBlock)); - dim3 threads(threadsPerBlock); - nms_kernel<<<blocks, threads>>>(boxes_num, - nms_overlap_thresh, - boxes_dev, - mask_dev); - - std::vector<unsigned long long> mask_host(boxes_num * col_blocks); - CUDA_CHECK(cudaMemcpy(&mask_host[0], - mask_dev, - sizeof(unsigned long long) * boxes_num * col_blocks, - cudaMemcpyDeviceToHost)); - - std::vector<unsigned long long> remv(col_blocks); - memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); - - int num_to_keep = 0; - for (int i = 0; i < boxes_num; i++) { - int nblock = i / threadsPerBlock; - int inblock = i % threadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - keep_out[num_to_keep++] = i; - unsigned long long *p = &mask_host[0] + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - *num_out = num_to_keep; - - CUDA_CHECK(cudaFree(boxes_dev)); - CUDA_CHECK(cudaFree(mask_dev)); -} diff --git a/example/rcnn/rcnn/cython/setup.py b/example/rcnn/rcnn/cython/setup.py deleted file mode 100644 index 2646398e0e18..000000000000 --- a/example/rcnn/rcnn/cython/setup.py +++ /dev/null @@ -1,163 +0,0 @@ -# -------------------------------------------------------- -# Fast R-CNN -# Copyright (c) 2015 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Ross Girshick -# -------------------------------------------------------- - -import os -from os.path import join as pjoin -from setuptools import setup -from distutils.extension import Extension -from Cython.Distutils import build_ext -import numpy as np - - -def find_in_path(name, path): - "Find a file in a search path" - # Adapted from - # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ - for dir in path.split(os.pathsep): - binpath = pjoin(dir, name) - if os.path.exists(binpath): - return os.path.abspath(binpath) - return None - - -def locate_cuda(): - """Locate the CUDA environment on the system - - Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' - and values giving the absolute path to each directory. - - Starts by looking for the CUDAHOME env variable. If not found, everything - is based on finding 'nvcc' in the PATH. - """ - - # first check if the CUDAHOME env variable is in use - if 'CUDAHOME' in os.environ: - home = os.environ['CUDAHOME'] - nvcc = pjoin(home, 'bin', 'nvcc') - else: - # otherwise, search the PATH for NVCC - default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') - nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) - if nvcc is None: - raise EnvironmentError('The nvcc binary could not be ' - 'located in your $PATH. 
Either add it to your path, or set $CUDAHOME') - home = os.path.dirname(os.path.dirname(nvcc)) - - cudaconfig = {'home':home, 'nvcc':nvcc, - 'include': pjoin(home, 'include'), - 'lib64': pjoin(home, 'lib64')} - for k, v in cudaconfig.items(): - if not os.path.exists(v): - raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) - - return cudaconfig - - -# Test if CUDA could be found -try: - CUDA = locate_cuda() -except EnvironmentError: - CUDA = None - - -# Obtain the numpy include directory. This logic works across numpy versions. -try: - numpy_include = np.get_include() -except AttributeError: - numpy_include = np.get_numpy_include() - - -def customize_compiler_for_nvcc(self): - """inject deep into distutils to customize how the dispatch - to gcc/nvcc works. - - If you subclass UnixCCompiler, it's not trivial to get your subclass - injected in, and still have the right customizations (i.e. - distutils.sysconfig.customize_compiler) run on it. So instead of going - the OO route, I have this. Note, it's kind of like a weird functional - subclassing going on.""" - - # tell the compiler it can process .cu - self.src_extensions.append('.cu') - - # save references to the default compiler_so and _compile methods - default_compiler_so = self.compiler_so - super = self._compile - - # now redefine the _compile method. This gets executed for each - # object but distutils doesn't have the ability to change compilers - # based on source extension: we add it. - def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): - if os.path.splitext(src)[1] == '.cu': - # use the cuda for .cu files - self.set_executable('compiler_so', CUDA['nvcc']) - # use only a subset of the extra_postargs, which are 1-1 translated - # from the extra_compile_args in the Extension class - postargs = extra_postargs['nvcc'] - else: - postargs = extra_postargs['gcc'] - - super(obj, src, ext, cc_args, postargs, pp_opts) - # reset the default compiler_so, which we might have changed for cuda - self.compiler_so = default_compiler_so - - # inject our redefined _compile method into the class - self._compile = _compile - - -# run the customize_compiler -class custom_build_ext(build_ext): - def build_extensions(self): - customize_compiler_for_nvcc(self.compiler) - build_ext.build_extensions(self) - - -ext_modules = [ - Extension( - "bbox", - ["bbox.pyx"], - extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, - include_dirs=[numpy_include] - ), - Extension( - "cpu_nms", - ["cpu_nms.pyx"], - extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, - include_dirs = [numpy_include] - ), -] - -if CUDA is not None: - ext_modules.append( - Extension('gpu_nms', - ['nms_kernel.cu', 'gpu_nms.pyx'], - library_dirs=[CUDA['lib64']], - libraries=['cudart'], - language='c++', - runtime_library_dirs=[CUDA['lib64']], - # this syntax is specific to this build system - # we're only going to use certain compiler args with nvcc and not with - # gcc the implementation of this trick is in customize_compiler() below - extra_compile_args={'gcc': ["-Wno-unused-function"], - 'nvcc': ['-arch=sm_35', - '--ptxas-options=-v', - '-c', - '--compiler-options', - "'-fPIC'"]}, - include_dirs = [numpy_include, CUDA['include']] - ) - ) -else: - print('Skipping GPU_NMS') - - -setup( - name='frcnn_cython', - ext_modules=ext_modules, - # inject our custom trigger - cmdclass={'build_ext': custom_build_ext}, -) diff --git a/example/rcnn/rcnn/dataset/__init__.py b/example/rcnn/rcnn/dataset/__init__.py deleted file mode 100644 index 
80fcc32c21a0..000000000000 --- a/example/rcnn/rcnn/dataset/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -from .imdb import IMDB -from .pascal_voc import PascalVOC -from .coco import coco diff --git a/example/rcnn/rcnn/dataset/imdb.py b/example/rcnn/rcnn/dataset/imdb.py deleted file mode 100644 index 5908cc3358eb..000000000000 --- a/example/rcnn/rcnn/dataset/imdb.py +++ /dev/null @@ -1,314 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements.  See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership.  The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License.  You may obtain a copy of the License at -# -#   http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied.  See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -General image database -An image database keeps a list of relative image paths called image_set_index and -transforms an index into an absolute image path. For training, it is necessary that ground -truth and proposals are mixed together. 
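Concretely, the roidb format spelled out just below amounts to one dict per image; a sketch of a single record, with illustrative values only (the path, class index and box are hypothetical, and 'bbox_targets' is filled in later by the training pipeline):

    import numpy as np

    num_classes = 21                       # VOC: 20 classes + background
    overlaps = np.zeros((1, num_classes), dtype=np.float32)
    overlaps[0, 12] = 1.0                  # one ground-truth box of class 12 ('dog')

    roi_rec = {
        'image': '.../VOC2007/JPEGImages/000005.jpg',               # hypothetical
        'height': 375, 'width': 500, 'flipped': False,
        'boxes': np.array([[262, 210, 323, 338]], dtype=np.uint16), # x1 y1 x2 y2
        'gt_classes': np.array([12], dtype=np.int32),
        'gt_overlaps': overlaps,
        'max_classes': overlaps.argmax(axis=1),
        'max_overlaps': overlaps.max(axis=1),
    }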
-roidb -basic format [image_index] -['image', 'height', 'width', 'flipped', -'boxes', 'gt_classes', 'gt_overlaps', 'max_classes', 'max_overlaps', 'bbox_targets'] -""" - -from ..logger import logger -import os -try: - import cPickle as pickle -except ImportError: - import pickle -import numpy as np -from ..processing.bbox_transform import bbox_overlaps - - -class IMDB(object): - def __init__(self, name, image_set, root_path, dataset_path): - """ - basic information about an image database - :param name: name of image database will be used for any output - :param root_path: root path store cache and proposal data - :param dataset_path: dataset path store images and image lists - """ - self.name = name + '_' + image_set - self.image_set = image_set - self.root_path = root_path - self.data_path = dataset_path - - # abstract attributes - self.classes = [] - self.num_classes = 0 - self.image_set_index = [] - self.num_images = 0 - - self.config = {} - - def image_path_from_index(self, index): - raise NotImplementedError - - def gt_roidb(self): - raise NotImplementedError - - def evaluate_detections(self, detections): - raise NotImplementedError - - @property - def cache_path(self): - """ - make a directory to store all caches - :return: cache path - """ - cache_path = os.path.join(self.root_path, 'cache') - if not os.path.exists(cache_path): - os.mkdir(cache_path) - return cache_path - - def image_path_at(self, index): - """ - access image at index in image database - :param index: image index in image database - :return: image path - """ - return self.image_path_from_index(self.image_set_index[index]) - - def load_rpn_data(self, full=False): - if full: - rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_full_rpn.pkl') - else: - rpn_file = os.path.join(self.root_path, 'rpn_data', self.name + '_rpn.pkl') - assert os.path.exists(rpn_file), '%s rpn data not found at %s' % (self.name, rpn_file) - logger.info('%s loading rpn data from %s' % (self.name, rpn_file)) - with open(rpn_file, 'rb') as f: - box_list = pickle.load(f) - return box_list - - def load_rpn_roidb(self, gt_roidb): - """ - turn rpn detection boxes into roidb - :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - """ - box_list = self.load_rpn_data() - return self.create_roidb_from_box_list(box_list, gt_roidb) - - def rpn_roidb(self, gt_roidb, append_gt=False): - """ - get rpn roidb and ground truth roidb - :param gt_roidb: ground truth roidb - :param append_gt: append ground truth - :return: roidb of rpn - """ - if append_gt: - logger.info('%s appending ground truth annotations' % self.name) - rpn_roidb = self.load_rpn_roidb(gt_roidb) - roidb = IMDB.merge_roidbs(gt_roidb, rpn_roidb) - else: - roidb = self.load_rpn_roidb(gt_roidb) - return roidb - - def create_roidb_from_box_list(self, box_list, gt_roidb): - """ - given ground truth, prepare roidb - :param box_list: [image_index] ndarray of [box_index][x1, x2, y1, y2] - :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - """ - assert len(box_list) == self.num_images, 'number of boxes matrix must match number of images' - roidb = [] - for i in range(self.num_images): - roi_rec = dict() - roi_rec['image'] = gt_roidb[i]['image'] - roi_rec['height'] = gt_roidb[i]['height'] - roi_rec['width'] = gt_roidb[i]['width'] - - boxes = box_list[i] - if boxes.shape[1] 
== 5: - boxes = boxes[:, :4] - num_boxes = boxes.shape[0] - overlaps = np.zeros((num_boxes, self.num_classes), dtype=np.float32) - if gt_roidb is not None and gt_roidb[i]['boxes'].size > 0: - gt_boxes = gt_roidb[i]['boxes'] - gt_classes = gt_roidb[i]['gt_classes'] - # n boxes and k gt_boxes => n * k overlap - gt_overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float)) - # for each box in n boxes, select only maximum overlap (must be greater than zero) - argmaxes = gt_overlaps.argmax(axis=1) - maxes = gt_overlaps.max(axis=1) - I = np.where(maxes > 0)[0] - overlaps[I, gt_classes[argmaxes[I]]] = maxes[I] - - roi_rec.update({'boxes': boxes, - 'gt_classes': np.zeros((num_boxes,), dtype=np.int32), - 'gt_overlaps': overlaps, - 'max_classes': overlaps.argmax(axis=1), - 'max_overlaps': overlaps.max(axis=1), - 'flipped': False}) - - # background roi => background class - zero_indexes = np.where(roi_rec['max_overlaps'] == 0)[0] - assert all(roi_rec['max_classes'][zero_indexes] == 0) - # foreground roi => foreground class - nonzero_indexes = np.where(roi_rec['max_overlaps'] > 0)[0] - assert all(roi_rec['max_classes'][nonzero_indexes] != 0) - - roidb.append(roi_rec) - - return roidb - - def append_flipped_images(self, roidb): - """ - append flipped images to an roidb - flip boxes coordinates, images will be actually flipped when loading into network - :param roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - """ - logger.info('%s append flipped images to roidb' % self.name) - assert self.num_images == len(roidb) - for i in range(self.num_images): - roi_rec = roidb[i] - boxes = roi_rec['boxes'].copy() - oldx1 = boxes[:, 0].copy() - oldx2 = boxes[:, 2].copy() - boxes[:, 0] = roi_rec['width'] - oldx2 - 1 - boxes[:, 2] = roi_rec['width'] - oldx1 - 1 - assert (boxes[:, 2] >= boxes[:, 0]).all() - entry = {'image': roi_rec['image'], - 'height': roi_rec['height'], - 'width': roi_rec['width'], - 'boxes': boxes, - 'gt_classes': roidb[i]['gt_classes'], - 'gt_overlaps': roidb[i]['gt_overlaps'], - 'max_classes': roidb[i]['max_classes'], - 'max_overlaps': roidb[i]['max_overlaps'], - 'flipped': True} - roidb.append(entry) - - self.image_set_index *= 2 - return roidb - - def evaluate_recall(self, roidb, candidate_boxes=None, thresholds=None): - """ - evaluate detection proposal recall metrics - record max overlap value for each gt box; return vector of overlap values - :param roidb: used to evaluate - :param candidate_boxes: if not given, use roidb's non-gt boxes - :param thresholds: array-like recall threshold - :return: None - ar: average recall, recalls: vector recalls at each IoU overlap threshold - thresholds: vector of IoU overlap threshold, gt_overlaps: vector of all ground-truth overlaps - """ - area_names = ['all', '0-25', '25-50', '50-100', - '100-200', '200-300', '300-inf'] - area_ranges = [[0**2, 1e5**2], [0**2, 25**2], [25**2, 50**2], [50**2, 100**2], - [100**2, 200**2], [200**2, 300**2], [300**2, 1e5**2]] - area_counts = [] - for area_name, area_range in zip(area_names[1:], area_ranges[1:]): - area_count = 0 - for i in range(self.num_images): - if candidate_boxes is None: - # default is use the non-gt boxes from roidb - non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0] - boxes = roidb[i]['boxes'][non_gt_inds, :] - else: - boxes = candidate_boxes[i] - boxes_areas = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1) - valid_range_inds = np.where((boxes_areas >= 
area_range[0]) & (boxes_areas < area_range[1]))[0] - area_count += len(valid_range_inds) - area_counts.append(area_count) - total_counts = float(sum(area_counts)) - for area_name, area_count in zip(area_names[1:], area_counts): - logger.info('percentage of %s is %f' % (area_name, area_count / total_counts)) - logger.info('average number of proposal is %f' % (total_counts / self.num_images)) - for area_name, area_range in zip(area_names, area_ranges): - gt_overlaps = np.zeros(0) - num_pos = 0 - for i in range(self.num_images): - # check for max_overlaps == 1 avoids including crowd annotations - max_gt_overlaps = roidb[i]['gt_overlaps'].max(axis=1) - gt_inds = np.where((roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0] - gt_boxes = roidb[i]['boxes'][gt_inds, :] - gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1) - valid_gt_inds = np.where((gt_areas >= area_range[0]) & (gt_areas < area_range[1]))[0] - gt_boxes = gt_boxes[valid_gt_inds, :] - num_pos += len(valid_gt_inds) - - if candidate_boxes is None: - # default is use the non-gt boxes from roidb - non_gt_inds = np.where(roidb[i]['gt_classes'] == 0)[0] - boxes = roidb[i]['boxes'][non_gt_inds, :] - else: - boxes = candidate_boxes[i] - if boxes.shape[0] == 0: - continue - - overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float)) - - _gt_overlaps = np.zeros((gt_boxes.shape[0])) - # choose whatever is smaller to iterate - rounds = min(boxes.shape[0], gt_boxes.shape[0]) - for j in range(rounds): - # find which proposal maximally covers each gt box - argmax_overlaps = overlaps.argmax(axis=0) - # get the IoU amount of coverage for each gt box - max_overlaps = overlaps.max(axis=0) - # find which gt box is covered by most IoU - gt_ind = max_overlaps.argmax() - gt_ovr = max_overlaps.max() - assert (gt_ovr >= 0), '%s\n%s\n%s' % (boxes, gt_boxes, overlaps) - # find the proposal box that covers the best covered gt box - box_ind = argmax_overlaps[gt_ind] - # record the IoU coverage of this gt box - _gt_overlaps[j] = overlaps[box_ind, gt_ind] - assert (_gt_overlaps[j] == gt_ovr) - # mark the proposal box and the gt box as used - overlaps[box_ind, :] = -1 - overlaps[:, gt_ind] = -1 - # append recorded IoU coverage level - gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) - - gt_overlaps = np.sort(gt_overlaps) - if thresholds is None: - step = 0.05 - thresholds = np.arange(0.5, 0.95 + 1e-5, step) - recalls = np.zeros_like(thresholds) - - # compute recall for each IoU threshold - for i, t in enumerate(thresholds): - recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) - ar = recalls.mean() - - # print results - print('average recall for {}: {:.3f}'.format(area_name, ar)) - for threshold, recall in zip(thresholds, recalls): - print('recall @{:.2f}: {:.3f}'.format(threshold, recall)) - - @staticmethod - def merge_roidbs(a, b): - """ - merge roidbs into one - :param a: roidb to be merged into - :param b: roidb to be merged - :return: merged imdb - """ - assert len(a) == len(b) - for i in range(len(a)): - a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) - a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], b[i]['gt_classes'])) - a[i]['gt_overlaps'] = np.vstack((a[i]['gt_overlaps'], b[i]['gt_overlaps'])) - a[i]['max_classes'] = np.hstack((a[i]['max_classes'], b[i]['max_classes'])) - a[i]['max_overlaps'] = np.hstack((a[i]['max_overlaps'], b[i]['max_overlaps'])) - return a diff --git a/example/rcnn/rcnn/dataset/pascal_voc.py b/example/rcnn/rcnn/dataset/pascal_voc.py deleted file mode 
100644 index 753f7038aa67..000000000000 --- a/example/rcnn/rcnn/dataset/pascal_voc.py +++ /dev/null @@ -1,280 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Pascal VOC database -This class loads ground truth notations from standard Pascal VOC XML data formats -and transform them into IMDB format. Selective search is used for proposals, see roidb -function. Results are written as the Pascal VOC format. Evaluation is based on mAP -criterion. -""" - -try: - import cPickle as pickle -except ImportError: - import pickle -import cv2 -import os -import numpy as np - -from ..logger import logger -from .imdb import IMDB -from .pascal_voc_eval import voc_eval -from .ds_utils import unique_boxes, filter_small_boxes - - -class PascalVOC(IMDB): - def __init__(self, image_set, root_path, devkit_path): - """ - fill basic information to initialize imdb - :param image_set: 2007_trainval, 2007_test, etc - :param root_path: 'selective_search_data' and 'cache' - :param devkit_path: data and results - :return: imdb object - """ - year, image_set = image_set.split('_') - super(PascalVOC, self).__init__('voc_' + year, image_set, root_path, devkit_path) # set self.name - self.year = year - self.root_path = root_path - self.devkit_path = devkit_path - self.data_path = os.path.join(devkit_path, 'VOC' + year) - - self.classes = ['__background__', # always index 0 - 'aeroplane', 'bicycle', 'bird', 'boat', - 'bottle', 'bus', 'car', 'cat', 'chair', - 'cow', 'diningtable', 'dog', 'horse', - 'motorbike', 'person', 'pottedplant', - 'sheep', 'sofa', 'train', 'tvmonitor'] - self.num_classes = len(self.classes) - self.image_set_index = self.load_image_set_index() - self.num_images = len(self.image_set_index) - logger.info('%s num_images %d' % (self.name, self.num_images)) - - self.config = {'comp_id': 'comp4', - 'use_diff': False, - 'min_size': 2} - - def load_image_set_index(self): - """ - find out which indexes correspond to given image set (train or val) - :return: - """ - image_set_index_file = os.path.join(self.data_path, 'ImageSets', 'Main', self.image_set + '.txt') - assert os.path.exists(image_set_index_file), 'Path does not exist: {}'.format(image_set_index_file) - with open(image_set_index_file) as f: - image_set_index = [x.strip() for x in f.readlines()] - return image_set_index - - def image_path_from_index(self, index): - """ - given image index, find out full path - :param index: index of a specific image - :return: full path of this image - """ - image_file = os.path.join(self.data_path, 'JPEGImages', index + '.jpg') - assert os.path.exists(image_file), 'Path does not exist: {}'.format(image_file) - return image_file - - def gt_roidb(self): - """ - return ground truth image regions database - :return: imdb[image_index]['boxes', 'gt_classes', 'gt_overlaps', 
'flipped'] - """ - cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') - if os.path.exists(cache_file): - with open(cache_file, 'rb') as fid: - roidb = pickle.load(fid) - logger.info('%s gt roidb loaded from %s' % (self.name, cache_file)) - return roidb - - gt_roidb = [self.load_pascal_annotation(index) for index in self.image_set_index] - with open(cache_file, 'wb') as fid: - pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) - logger.info('%s wrote gt roidb to %s' % (self.name, cache_file)) - - return gt_roidb - - def load_pascal_annotation(self, index): - """ - for a given index, load image and bounding boxes info from XML file - :param index: index of a specific image - :return: record['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - """ - import xml.etree.ElementTree as ET - roi_rec = dict() - roi_rec['image'] = self.image_path_from_index(index) - size = cv2.imread(roi_rec['image']).shape - roi_rec['height'] = size[0] - roi_rec['width'] = size[1] - - filename = os.path.join(self.data_path, 'Annotations', index + '.xml') - tree = ET.parse(filename) - objs = tree.findall('object') - if not self.config['use_diff']: - non_diff_objs = [obj for obj in objs if int(obj.find('difficult').text) == 0] - objs = non_diff_objs - num_objs = len(objs) - - boxes = np.zeros((num_objs, 4), dtype=np.uint16) - gt_classes = np.zeros((num_objs), dtype=np.int32) - overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) - - class_to_index = dict(zip(self.classes, range(self.num_classes))) - # Load object bounding boxes into a data frame. - for ix, obj in enumerate(objs): - bbox = obj.find('bndbox') - # Make pixel indexes 0-based - x1 = float(bbox.find('xmin').text) - 1 - y1 = float(bbox.find('ymin').text) - 1 - x2 = float(bbox.find('xmax').text) - 1 - y2 = float(bbox.find('ymax').text) - 1 - cls = class_to_index[obj.find('name').text.lower().strip()] - boxes[ix, :] = [x1, y1, x2, y2] - gt_classes[ix] = cls - overlaps[ix, cls] = 1.0 - - roi_rec.update({'boxes': boxes, - 'gt_classes': gt_classes, - 'gt_overlaps': overlaps, - 'max_classes': overlaps.argmax(axis=1), - 'max_overlaps': overlaps.max(axis=1), - 'flipped': False}) - return roi_rec - - def load_selective_search_roidb(self, gt_roidb): - """ - turn selective search proposals into selective search roidb - :param gt_roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - :return: roidb: [image_index]['boxes', 'gt_classes', 'gt_overlaps', 'flipped'] - """ - import scipy.io - matfile = os.path.join(self.root_path, 'selective_search_data', self.name + '.mat') - assert os.path.exists(matfile), 'selective search data does not exist: {}'.format(matfile) - raw_data = scipy.io.loadmat(matfile)['boxes'].ravel() # original was dict ['images', 'boxes'] - - box_list = [] - for i in range(raw_data.shape[0]): - boxes = raw_data[i][:, (1, 0, 3, 2)] - 1 # pascal voc dataset starts from 1. 
- keep = unique_boxes(boxes) - boxes = boxes[keep, :] - keep = filter_small_boxes(boxes, self.config['min_size']) - boxes = boxes[keep, :] - box_list.append(boxes) - - return self.create_roidb_from_box_list(box_list, gt_roidb) - - def selective_search_roidb(self, gt_roidb, append_gt=False): - """ - get selective search roidb and ground truth roidb - :param gt_roidb: ground truth roidb - :param append_gt: append ground truth - :return: roidb of selective search - """ - cache_file = os.path.join(self.cache_path, self.name + '_ss_roidb.pkl') - if os.path.exists(cache_file): - with open(cache_file, 'rb') as fid: - roidb = pickle.load(fid) - logger.info('%s ss roidb loaded from %s' % (self.name, cache_file)) - return roidb - - if append_gt: - logger.info('%s appending ground truth annotations' % self.name) - ss_roidb = self.load_selective_search_roidb(gt_roidb) - roidb = IMDB.merge_roidbs(gt_roidb, ss_roidb) - else: - roidb = self.load_selective_search_roidb(gt_roidb) - with open(cache_file, 'wb') as fid: - pickle.dump(roidb, fid, pickle.HIGHEST_PROTOCOL) - logger.info('%s wrote ss roidb to %s' % (self.name, cache_file)) - - return roidb - - def evaluate_detections(self, detections): - """ - top level evaluations - :param detections: result matrix, [bbox, confidence] - :return: None - """ - # make all these folders for results - result_dir = os.path.join(self.devkit_path, 'results') - if not os.path.exists(result_dir): - os.mkdir(result_dir) - year_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year) - if not os.path.exists(year_folder): - os.mkdir(year_folder) - res_file_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year, 'Main') - if not os.path.exists(res_file_folder): - os.mkdir(res_file_folder) - - self.write_pascal_results(detections) - self.do_python_eval() - - def get_result_file_template(self): - """ - this is a template - VOCdevkit/results/VOC2007/Main/_det_test_aeroplane.txt - :return: a string template - """ - res_file_folder = os.path.join(self.devkit_path, 'results', 'VOC' + self.year, 'Main') - comp_id = self.config['comp_id'] - filename = comp_id + '_det_' + self.image_set + '_{:s}.txt' - path = os.path.join(res_file_folder, filename) - return path - - def write_pascal_results(self, all_boxes): - """ - write results files in pascal devkit path - :param all_boxes: boxes to be processed [bbox, confidence] - :return: None - """ - for cls_ind, cls in enumerate(self.classes): - if cls == '__background__': - continue - logger.info('Writing %s VOC results file' % cls) - filename = self.get_result_file_template().format(cls) - with open(filename, 'wt') as f: - for im_ind, index in enumerate(self.image_set_index): - dets = all_boxes[cls_ind][im_ind] - if len(dets) == 0: - continue - # the VOCdevkit expects 1-based indices - for k in range(dets.shape[0]): - f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. - format(index, dets[k, -1], - dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1)) - - def do_python_eval(self): - """ - python evaluation wrapper - :return: None - """ - annopath = os.path.join(self.data_path, 'Annotations', '{0!s}.xml') - imageset_file = os.path.join(self.data_path, 'ImageSets', 'Main', self.image_set + '.txt') - annocache = os.path.join(self.cache_path, self.name + '_annotations.pkl') - aps = [] - # The PASCAL VOC metric changed in 2010 - use_07_metric = True if int(self.year) < 2010 else False - logger.info('VOC07 metric? 
' + ('Y' if use_07_metric else 'No')) - for cls_ind, cls in enumerate(self.classes): - if cls == '__background__': - continue - filename = self.get_result_file_template().format(cls) - rec, prec, ap = voc_eval(filename, annopath, imageset_file, cls, annocache, - ovthresh=0.5, use_07_metric=use_07_metric) - aps += [ap] - logger.info('AP for {} = {:.4f}'.format(cls, ap)) - logger.info('Mean AP = {:.4f}'.format(np.mean(aps))) diff --git a/example/rcnn/rcnn/dataset/pascal_voc_eval.py b/example/rcnn/rcnn/dataset/pascal_voc_eval.py deleted file mode 100644 index 2583aed16679..000000000000 --- a/example/rcnn/rcnn/dataset/pascal_voc_eval.py +++ /dev/null @@ -1,194 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -given a pascal voc imdb, compute mAP -""" - -from ..logger import logger -import numpy as np -import os -try: - import cPickle as pickle -except ImportError: - import pickle - -def parse_voc_rec(filename): - """ - parse pascal voc record into a dictionary - :param filename: xml file path - :return: list of dict - """ - import xml.etree.ElementTree as ET - tree = ET.parse(filename) - objects = [] - for obj in tree.findall('object'): - obj_dict = dict() - obj_dict['name'] = obj.find('name').text - obj_dict['difficult'] = int(obj.find('difficult').text) - bbox = obj.find('bndbox') - obj_dict['bbox'] = [int(float(bbox.find('xmin').text)), - int(float(bbox.find('ymin').text)), - int(float(bbox.find('xmax').text)), - int(float(bbox.find('ymax').text))] - objects.append(obj_dict) - return objects - - -def voc_ap(rec, prec, use_07_metric=False): - """ - average precision calculations - [precision integrated to recall] - :param rec: recall - :param prec: precision - :param use_07_metric: 2007 metric is 11-recall-point based AP - :return: average precision - """ - if use_07_metric: - ap = 0. - for t in np.arange(0., 1.1, 0.1): - if np.sum(rec >= t) == 0: - p = 0 - else: - p = np.max(prec[rec >= t]) - ap += p / 11. 
- else: - # append sentinel values at both ends - mrec = np.concatenate(([0.], rec, [1.])) - mpre = np.concatenate(([0.], prec, [0.])) - - # compute precision integration ladder - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # look for recall value changes - i = np.where(mrec[1:] != mrec[:-1])[0] - - # sum (\delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap - - -def voc_eval(detpath, annopath, imageset_file, classname, annocache, ovthresh=0.5, use_07_metric=False): - """ - pascal voc evaluation - :param detpath: detection results detpath.format(classname) - :param annopath: annotations annopath.format(classname) - :param imageset_file: text file containing list of images - :param classname: category name - :param annocache: caching annotations - :param ovthresh: overlap threshold - :param use_07_metric: whether to use voc07's 11 point ap computation - :return: rec, prec, ap - """ - with open(imageset_file, 'r') as f: - lines = f.readlines() - image_filenames = [x.strip() for x in lines] - - # load annotations from cache - if not os.path.isfile(annocache): - recs = {} - for ind, image_filename in enumerate(image_filenames): - recs[image_filename] = parse_voc_rec(annopath.format(image_filename)) - if ind % 100 == 0: - logger.info('reading annotations for %d/%d' % (ind + 1, len(image_filenames))) - logger.info('saving annotations cache to %s' % annocache) - with open(annocache, 'wb') as f: - pickle.dump(recs, f, protocol=pickle.HIGHEST_PROTOCOL) - else: - with open(annocache, 'rb') as f: - recs = pickle.load(f) - - # extract objects in :param classname: - class_recs = {} - npos = 0 - for image_filename in image_filenames: - objects = [obj for obj in recs[image_filename] if obj['name'] == classname] - bbox = np.array([x['bbox'] for x in objects]) - difficult = np.array([x['difficult'] for x in objects]).astype(np.bool) - det = [False] * len(objects) # stand for detected - npos = npos + sum(~difficult) - class_recs[image_filename] = {'bbox': bbox, - 'difficult': difficult, - 'det': det} - - # read detections - detfile = detpath.format(classname) - with open(detfile, 'r') as f: - lines = f.readlines() - - splitlines = [x.strip().split(' ') for x in lines] - image_ids = [x[0] for x in splitlines] - confidence = np.array([float(x[1]) for x in splitlines]) - bbox = np.array([[float(z) for z in x[2:]] for x in splitlines]) - - # sort by confidence - if bbox.shape[0] > 0: - sorted_inds = np.argsort(-confidence) - sorted_scores = np.sort(-confidence) - bbox = bbox[sorted_inds, :] - image_ids = [image_ids[x] for x in sorted_inds] - - # go down detections and mark true positives and false positives - nd = len(image_ids) - tp = np.zeros(nd) - fp = np.zeros(nd) - for d in range(nd): - r = class_recs[image_ids[d]] - bb = bbox[d, :].astype(float) - ovmax = -np.inf - bbgt = r['bbox'].astype(float) - - if bbgt.size > 0: - # compute overlaps - # intersection - ixmin = np.maximum(bbgt[:, 0], bb[0]) - iymin = np.maximum(bbgt[:, 1], bb[1]) - ixmax = np.minimum(bbgt[:, 2], bb[2]) - iymax = np.minimum(bbgt[:, 3], bb[3]) - iw = np.maximum(ixmax - ixmin + 1., 0.) - ih = np.maximum(iymax - iymin + 1., 0.) - inters = iw * ih - - # union - uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + - (bbgt[:, 2] - bbgt[:, 0] + 1.) * - (bbgt[:, 3] - bbgt[:, 1] + 1.) 
- inters) - - overlaps = inters / uni - ovmax = np.max(overlaps) - jmax = np.argmax(overlaps) - - if ovmax > ovthresh: - if not r['difficult'][jmax]: - if not r['det'][jmax]: - tp[d] = 1. - r['det'][jmax] = 1 - else: - fp[d] = 1. - else: - fp[d] = 1. - - # compute precision recall - fp = np.cumsum(fp) - tp = np.cumsum(tp) - rec = tp / float(npos) - # avoid division by zero in case first detection matches a difficult ground ruth - prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) - ap = voc_ap(rec, prec, use_07_metric) - - return rec, prec, ap diff --git a/example/rcnn/rcnn/io/__init__.py b/example/rcnn/rcnn/io/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/rcnn/rcnn/io/image.py b/example/rcnn/rcnn/io/image.py deleted file mode 100644 index 1b83300a47d6..000000000000 --- a/example/rcnn/rcnn/io/image.py +++ /dev/null @@ -1,157 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -import cv2 -import os -import random -from ..config import config - - -def get_image(roidb): - """ - preprocess image and return processed roidb - :param roidb: a list of roidb - :return: list of img as in mxnet format - roidb add new item['im_info'] - 0 --- x (width, second dim of im) - | - y (height, first dim of im) - """ - num_images = len(roidb) - processed_ims = [] - processed_roidb = [] - for i in range(num_images): - roi_rec = roidb[i] - assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) - im = cv2.imread(roi_rec['image']) - if roidb[i]['flipped']: - im = im[:, ::-1, :] - new_rec = roi_rec.copy() - scale_ind = random.randrange(len(config.SCALES)) - target_size = config.SCALES[scale_ind][0] - max_size = config.SCALES[scale_ind][1] - im, im_scale = resize(im, target_size, max_size, stride=config.IMAGE_STRIDE) - im_tensor = transform(im, config.PIXEL_MEANS) - processed_ims.append(im_tensor) - im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] - new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale - new_rec['im_info'] = im_info - processed_roidb.append(new_rec) - return processed_ims, processed_roidb - - -def resize(im, target_size, max_size, stride=0): - """ - only resize input image to target size and return scale - :param im: BGR image input by opencv - :param target_size: one dimensional size (the short side) - :param max_size: one dimensional max size (the long side) - :param stride: if given, pad the image to designated stride - :return: - """ - im_shape = im.shape - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - im_scale = float(target_size) / float(im_size_min) - # prevent bigger axis from being more than max_size: - if np.round(im_scale * im_size_max) > max_size: - im_scale = float(max_size) / float(im_size_max) - im = 
cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) - - if stride == 0: - return im, im_scale - else: - # pad to product of stride - im_height = int(np.ceil(im.shape[0] / float(stride)) * stride) - im_width = int(np.ceil(im.shape[1] / float(stride)) * stride) - im_channel = im.shape[2] - padded_im = np.zeros((im_height, im_width, im_channel)) - padded_im[:im.shape[0], :im.shape[1], :] = im - return padded_im, im_scale - - -def transform(im, pixel_means): - """ - transform into mxnet tensor, - subtract pixel size and transform to correct format - :param im: [height, width, channel] in BGR - :param pixel_means: [B, G, R pixel means] - :return: [batch, channel, height, width] - """ - im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1])) - for i in range(3): - im_tensor[0, i, :, :] = im[:, :, 2 - i] - pixel_means[2 - i] - return im_tensor - - -def transform_inverse(im_tensor, pixel_means): - """ - transform from mxnet im_tensor to ordinary RGB image - im_tensor is limited to one image - :param im_tensor: [batch, channel, height, width] - :param pixel_means: [B, G, R pixel means] - :return: im [height, width, channel(RGB)] - """ - assert im_tensor.shape[0] == 1 - im_tensor = im_tensor.copy() - # put channel back - channel_swap = (0, 2, 3, 1) - im_tensor = im_tensor.transpose(channel_swap) - im = im_tensor[0] - assert im.shape[2] == 3 - im += pixel_means[[2, 1, 0]] - im = im.astype(np.uint8) - return im - - -def tensor_vstack(tensor_list, pad=0): - """ - vertically stack tensors - :param tensor_list: list of tensor to be stacked vertically - :param pad: label to pad with - :return: tensor with max shape - """ - ndim = len(tensor_list[0].shape) - dtype = tensor_list[0].dtype - islice = tensor_list[0].shape[0] - dimensions = [] - first_dim = sum([tensor.shape[0] for tensor in tensor_list]) - dimensions.append(first_dim) - for dim in range(1, ndim): - dimensions.append(max([tensor.shape[dim] for tensor in tensor_list])) - if pad == 0: - all_tensor = np.zeros(tuple(dimensions), dtype=dtype) - elif pad == 1: - all_tensor = np.ones(tuple(dimensions), dtype=dtype) - else: - all_tensor = np.full(tuple(dimensions), pad, dtype=dtype) - if ndim == 1: - for ind, tensor in enumerate(tensor_list): - all_tensor[ind*islice:(ind+1)*islice] = tensor - elif ndim == 2: - for ind, tensor in enumerate(tensor_list): - all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1]] = tensor - elif ndim == 3: - for ind, tensor in enumerate(tensor_list): - all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2]] = tensor - elif ndim == 4: - for ind, tensor in enumerate(tensor_list): - all_tensor[ind*islice:(ind+1)*islice, :tensor.shape[1], :tensor.shape[2], :tensor.shape[3]] = tensor - else: - raise Exception('Sorry, unimplemented.') - return all_tensor diff --git a/example/rcnn/rcnn/io/rcnn.py b/example/rcnn/rcnn/io/rcnn.py deleted file mode 100644 index d11c7cadace4..000000000000 --- a/example/rcnn/rcnn/io/rcnn.py +++ /dev/null @@ -1,194 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
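
The `resize`/`transform`/`tensor_vstack` helpers above are the whole single-image pipeline: scale the short side to `target_size` (capping the long side at `max_size`), flip BGR HWC into a mean-subtracted RGB NCHW tensor, then pad-and-stack variable-sized tensors into one batch. A minimal numpy-only sketch of the transform and scaling steps — the image size and pixel means below are illustrative, not taken from the module's `config`:

```python
import numpy as np

def transform_demo(im, pixel_means=(104.0, 117.0, 123.0)):   # illustrative BGR means
    """BGR HWC uint8 image -> (1, 3, H, W) float tensor, mean-subtracted, RGB order."""
    tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
    for c in range(3):
        tensor[0, c] = im[:, :, 2 - c] - pixel_means[2 - c]  # index 2-c flips BGR to RGB
    return tensor

im = np.random.randint(0, 256, (375, 500, 3), dtype=np.uint8)  # fake VOC-sized image
target_size, max_size = 600, 1000
im_scale = min(target_size / min(im.shape[:2]), max_size / max(im.shape[:2]))
print(transform_demo(im).shape, im_scale)   # (1, 3, 375, 500) 1.6
```
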
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Fast R-CNN: -data = - {'data': [num_images, c, h, w], - 'rois': [num_rois, 5]} -label = - {'label': [num_rois], - 'bbox_target': [num_rois, 4 * num_classes], - 'bbox_weight': [num_rois, 4 * num_classes]} -roidb extended format [image_index] - ['image', 'height', 'width', 'flipped', - 'boxes', 'gt_classes', 'gt_overlaps', 'max_classes', 'max_overlaps', 'bbox_targets'] -""" - -import numpy as np -import numpy.random as npr - -from ..config import config -from ..io.image import get_image, tensor_vstack -from ..processing.bbox_transform import bbox_overlaps, bbox_transform -from ..processing.bbox_regression import expand_bbox_regression_targets - - -def get_rcnn_testbatch(roidb): - """ - return a dict of testbatch - :param roidb: ['image', 'flipped'] + ['boxes'] - :return: data, label, im_info - """ - assert len(roidb) == 1, 'Single batch only' - imgs, roidb = get_image(roidb) - im_array = imgs[0] - im_info = np.array([roidb[0]['im_info']], dtype=np.float32) - - im_rois = roidb[0]['boxes'] - rois = im_rois - batch_index = 0 * np.ones((rois.shape[0], 1)) - rois_array = np.hstack((batch_index, rois))[np.newaxis, :] - - data = {'data': im_array, - 'rois': rois_array} - label = {} - - return data, label, im_info - - -def get_rcnn_batch(roidb): - """ - return a dict of multiple images - :param roidb: a list of dict, whose length controls batch size - ['images', 'flipped'] + ['gt_boxes', 'boxes', 'gt_overlap'] => ['bbox_targets'] - :return: data, label - """ - num_images = len(roidb) - imgs, roidb = get_image(roidb) - im_array = tensor_vstack(imgs) - - assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \ - 'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS) - rois_per_image = config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES - fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(np.int) - - rois_array = list() - labels_array = list() - bbox_targets_array = list() - bbox_weights_array = list() - - for im_i in range(num_images): - roi_rec = roidb[im_i] - - # infer num_classes from gt_overlaps - num_classes = roi_rec['gt_overlaps'].shape[1] - - # label = class RoI has max overlap with - rois = roi_rec['boxes'] - labels = roi_rec['max_classes'] - overlaps = roi_rec['max_overlaps'] - bbox_targets = roi_rec['bbox_targets'] - - im_rois, labels, bbox_targets, bbox_weights = \ - sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, - labels, overlaps, bbox_targets) - - # project im_rois - # do not round roi - rois = im_rois - batch_index = im_i * np.ones((rois.shape[0], 1)) - rois_array_this_image = np.hstack((batch_index, rois)) - rois_array.append(rois_array_this_image) - - # add labels - labels_array.append(labels) - bbox_targets_array.append(bbox_targets) - bbox_weights_array.append(bbox_weights) - - rois_array = np.array(rois_array) - labels_array = np.array(labels_array) - bbox_targets_array = np.array(bbox_targets_array) - bbox_weights_array = np.array(bbox_weights_array) - - data = {'data': im_array, - 'rois': rois_array} - label = {'label': labels_array, - 'bbox_target': 
bbox_targets_array, - 'bbox_weight': bbox_weights_array} - - return data, label - - -def sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, - labels=None, overlaps=None, bbox_targets=None, gt_boxes=None): - """ - generate random sample of ROIs comprising foreground and background examples - :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index - :param fg_rois_per_image: foreground roi number - :param rois_per_image: total roi number - :param num_classes: number of classes - :param labels: maybe precomputed - :param overlaps: maybe precomputed (max_overlaps) - :param bbox_targets: maybe precomputed - :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) - :return: (labels, rois, bbox_targets, bbox_weights) - """ - if labels is None: - overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) - gt_assignment = overlaps.argmax(axis=1) - overlaps = overlaps.max(axis=1) - labels = gt_boxes[gt_assignment, 4] - - # foreground RoI with FG_THRESH overlap - fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] - # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs - fg_rois_per_this_image = int(np.minimum(fg_rois_per_image, fg_indexes.size)) - # Sample foreground regions without replacement - if len(fg_indexes) > fg_rois_per_this_image: - fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, replace=False) - - # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) - bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] - # Compute number of background RoIs to take from this image (guarding against there being fewer than desired) - bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image - bg_rois_per_this_image = int(np.minimum(bg_rois_per_this_image, bg_indexes.size)) - # Sample foreground regions without replacement - if len(bg_indexes) > bg_rois_per_this_image: - bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False) - - # indexes selected - keep_indexes = np.append(fg_indexes, bg_indexes) - neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0] - neg_rois = rois[neg_idx] - # pad more to ensure a fixed minibatch size - while keep_indexes.shape[0] < rois_per_image: - gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0]) - gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False) - keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes]) - - # select labels - labels = labels[keep_indexes] - # set labels of bg_rois to be 0 - labels[fg_rois_per_this_image:] = 0 - rois = rois[keep_indexes] - - # load or compute bbox_target - if bbox_targets is not None: - bbox_target_data = bbox_targets[keep_indexes, :] - else: - targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) - if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: - targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) - / np.array(config.TRAIN.BBOX_STDS)) - bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) - - bbox_targets, bbox_weights = \ - expand_bbox_regression_targets(bbox_target_data, num_classes) - - return rois, labels, bbox_targets, bbox_weights - diff --git a/example/rcnn/rcnn/io/rpn.py b/example/rcnn/rcnn/io/rpn.py deleted file mode 100644 index 59dd615aaa39..000000000000 --- a/example/rcnn/rcnn/io/rpn.py +++ /dev/null @@ -1,244 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
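
`sample_rois` above balances each Fast R-CNN minibatch: at most `FG_FRACTION * BATCH_ROIS` foreground RoIs (IoU at least `FG_THRESH`), the remainder drawn from the background overlap band, with extra negatives appended until the batch reaches its fixed size. A toy version of the fg/bg split, with assumed thresholds of 0.5 and [0.1, 0.5) and the padding loop omitted:

```python
import numpy as np

rng = np.random.default_rng(0)
overlaps = rng.random(300)            # max IoU of each candidate RoI with any gt box
rois_per_image, fg_fraction = 128, 0.25
fg_per_image = int(round(fg_fraction * rois_per_image))        # 32

fg = np.where(overlaps >= 0.5)[0]                              # assumed FG_THRESH = 0.5
bg = np.where((overlaps < 0.5) & (overlaps >= 0.1))[0]         # assumed [BG_THRESH_LO, BG_THRESH_HI)
fg = rng.choice(fg, size=min(fg_per_image, fg.size), replace=False)
bg = rng.choice(bg, size=min(rois_per_image - fg.size, bg.size), replace=False)
keep = np.concatenate([fg, bg])       # labels for the bg tail are then zeroed
print(fg.size, bg.size, keep.size)    # e.g. 32 96 128
```
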
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -RPN: -data = - {'data': [num_images, c, h, w], - 'im_info': [num_images, 4] (optional)} -label = - {'gt_boxes': [num_boxes, 5] (optional), - 'label': [batch_size, 1] <- [batch_size, num_anchors, feat_height, feat_width], - 'bbox_target': [batch_size, num_anchors, feat_height, feat_width], - 'bbox_weight': [batch_size, num_anchors, feat_height, feat_width]} -""" - -import logging -import numpy as np -import numpy.random as npr - -from ..logger import logger -from ..config import config -from .image import get_image, tensor_vstack -from ..processing.generate_anchor import generate_anchors -from ..processing.bbox_transform import bbox_overlaps, bbox_transform - - -def get_rpn_testbatch(roidb): - """ - return a dict of testbatch - :param roidb: ['image', 'flipped'] - :return: data, label, im_info - """ - assert len(roidb) == 1, 'Single batch only' - imgs, roidb = get_image(roidb) - im_array = imgs[0] - im_info = np.array([roidb[0]['im_info']], dtype=np.float32) - - data = {'data': im_array, - 'im_info': im_info} - label = {} - - return data, label, im_info - - -def get_rpn_batch(roidb): - """ - prototype for rpn batch: data, im_info, gt_boxes - :param roidb: ['image', 'flipped'] + ['gt_boxes', 'boxes', 'gt_classes'] - :return: data, label - """ - assert len(roidb) == 1, 'Single batch only' - imgs, roidb = get_image(roidb) - im_array = imgs[0] - im_info = np.array([roidb[0]['im_info']], dtype=np.float32) - - # gt boxes: (x1, y1, x2, y2, cls) - if roidb[0]['gt_classes'].size > 0: - gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] - gt_boxes = np.empty((roidb[0]['boxes'].shape[0], 5), dtype=np.float32) - gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] - gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] - else: - gt_boxes = np.empty((0, 5), dtype=np.float32) - - data = {'data': im_array, - 'im_info': im_info} - label = {'gt_boxes': gt_boxes} - - return data, label - - -def assign_anchor(feat_shape, gt_boxes, im_info, feat_stride=16, - scales=(8, 16, 32), ratios=(0.5, 1, 2), allowed_border=0): - """ - assign ground truth boxes to anchor positions - :param feat_shape: infer output shape - :param gt_boxes: assign ground truth - :param im_info: filter out anchors overlapped with edges - :param feat_stride: anchor position step - :param scales: used to generate anchors, affects num_anchors (per location) - :param ratios: aspect ratios of generated anchors - :param allowed_border: filter out anchors with edge overlap > allowed_border - :return: dict of label - 'label': of shape (batch_size, 1) <- (batch_size, num_anchors, feat_height, feat_width) - 'bbox_target': of shape (batch_size, num_anchors * 4, feat_height, feat_width) - 'bbox_inside_weight': *todo* mark the assigned anchors - 'bbox_outside_weight': used to normalize the bbox_loss, all weights sums to RPN_POSITIVE_WEIGHT - """ - def _unmap(data, count, inds, 
fill=0):
-        """ unmap a subset inds of data into original data of size count """
-        if len(data.shape) == 1:
-            ret = np.empty((count,), dtype=np.float32)
-            ret.fill(fill)
-            ret[inds] = data
-        else:
-            ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
-            ret.fill(fill)
-            ret[inds, :] = data
-        return ret
-
-    im_info = im_info[0]
-    scales = np.array(scales, dtype=np.float32)
-    base_anchors = generate_anchors(base_size=feat_stride, ratios=list(ratios), scales=scales)
-    num_anchors = base_anchors.shape[0]
-    feat_height, feat_width = feat_shape[-2:]
-
-    logger.debug('anchors: %s' % base_anchors)
-    logger.debug('anchor shapes: %s' % np.hstack((base_anchors[:, 2::4] - base_anchors[:, 0::4],
-                                                  base_anchors[:, 3::4] - base_anchors[:, 1::4])))
-    logger.debug('im_info %s' % im_info)
-    logger.debug('height %d width %d' % (feat_height, feat_width))
-    logger.debug('gt_boxes shape %s' % np.array(gt_boxes.shape))
-    logger.debug('gt_boxes %s' % gt_boxes)
-
-    # 1. generate proposals from bbox deltas and shifted anchors
-    shift_x = np.arange(0, feat_width) * feat_stride
-    shift_y = np.arange(0, feat_height) * feat_stride
-    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
-    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()
-    # add A anchors (1, A, 4) to
-    # cell K shifts (K, 1, 4) to get
-    # shift anchors (K, A, 4)
-    # reshape to (K*A, 4) shifted anchors
-    A = num_anchors
-    K = shifts.shape[0]
-    all_anchors = base_anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))
-    all_anchors = all_anchors.reshape((K * A, 4))
-    total_anchors = int(K * A)
-
-    # only keep anchors inside the image
-    inds_inside = np.where((all_anchors[:, 0] >= -allowed_border) &
-                           (all_anchors[:, 1] >= -allowed_border) &
-                           (all_anchors[:, 2] < im_info[1] + allowed_border) &
-                           (all_anchors[:, 3] < im_info[0] + allowed_border))[0]
-    logger.debug('total_anchors %d' % total_anchors)
-    logger.debug('inds_inside %d' % len(inds_inside))
-
-    # keep only inside anchors
-    anchors = all_anchors[inds_inside, :]
-    logger.debug('anchors shape %s' % np.array(anchors.shape))
-
-    # label: 1 is positive, 0 is negative, -1 is don't care
-    labels = np.empty((len(inds_inside),), dtype=np.float32)
-    labels.fill(-1)
-
-    if gt_boxes.size > 0:
-        # overlap between the anchors and the gt boxes
-        # overlaps (ex, gt)
-        overlaps = bbox_overlaps(anchors.astype(np.float), gt_boxes.astype(np.float))
-        argmax_overlaps = overlaps.argmax(axis=1)
-        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
-        gt_argmax_overlaps = overlaps.argmax(axis=0)
-        gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
-        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
-
-        if not config.TRAIN.RPN_CLOBBER_POSITIVES:
-            # assign bg labels first so that positive labels can clobber them
-            labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
-
-        # fg label: for each gt, anchor with highest overlap
-        labels[gt_argmax_overlaps] = 1
-
-        # fg label: above threshold IoU
-        labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1
-
-        if config.TRAIN.RPN_CLOBBER_POSITIVES:
-            # assign bg labels last so that negative labels can clobber positives
-            labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
-    else:
-        labels[:] = 0
-
-    # subsample positive labels if we have too many
-    num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE)
-    fg_inds = np.where(labels == 1)[0]
-    if len(fg_inds) > num_fg:
-        disable_inds = npr.choice(fg_inds,
size=(len(fg_inds) - num_fg), replace=False) - if logger.level == logging.DEBUG: - disable_inds = fg_inds[:(len(fg_inds) - num_fg)] - labels[disable_inds] = -1 - - # subsample negative labels if we have too many - num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) - bg_inds = np.where(labels == 0)[0] - if len(bg_inds) > num_bg: - disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) - if logger.level == logging.DEBUG: - disable_inds = bg_inds[:(len(bg_inds) - num_bg)] - labels[disable_inds] = -1 - - bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) - if gt_boxes.size > 0: - bbox_targets[:] = bbox_transform(anchors, gt_boxes[argmax_overlaps, :4]) - - bbox_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) - bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) - - if logger.level == logging.DEBUG: - _sums = bbox_targets[labels == 1, :].sum(axis=0) - _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) - _counts = np.sum(labels == 1) - means = _sums / (_counts + 1e-14) - stds = np.sqrt(_squared_sums / _counts - means ** 2) - logger.debug('means %s' % means) - logger.debug('stdevs %s' % stds) - - # map up to original set of anchors - labels = _unmap(labels, total_anchors, inds_inside, fill=-1) - bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) - bbox_weights = _unmap(bbox_weights, total_anchors, inds_inside, fill=0) - - if logger.level == logging.DEBUG: - if gt_boxes.size > 0: - logger.debug('rpn: max max_overlaps %f' % np.max(max_overlaps)) - logger.debug('rpn: num_positives %f' % np.sum(labels == 1)) - logger.debug('rpn: num_negatives %f' % np.sum(labels == 0)) - _fg_sum = np.sum(labels == 1) - _bg_sum = np.sum(labels == 0) - _count = 1 - logger.debug('rpn: num_positive avg %f' % (_fg_sum / _count)) - logger.debug('rpn: num_negative avg %f' % (_bg_sum / _count)) - - labels = labels.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) - labels = labels.reshape((1, A * feat_height * feat_width)) - bbox_targets = bbox_targets.reshape((1, feat_height, feat_width, A * 4)).transpose(0, 3, 1, 2) - bbox_weights = bbox_weights.reshape((1, feat_height, feat_width, A * 4)).transpose((0, 3, 1, 2)) - - label = {'label': labels, - 'bbox_target': bbox_targets, - 'bbox_weight': bbox_weights} - return label diff --git a/example/rcnn/rcnn/processing/__init__.py b/example/rcnn/rcnn/processing/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/rcnn/rcnn/processing/bbox_regression.py b/example/rcnn/rcnn/processing/bbox_regression.py deleted file mode 100644 index 24812ac2bda2..000000000000 --- a/example/rcnn/rcnn/processing/bbox_regression.py +++ /dev/null @@ -1,137 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
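
Step 1 of `assign_anchor` above tiles the `A` base anchors over all `K` feature-map cells with a single broadcast add of a `(K, 1, 4)` shift array and a `(1, A, 4)` anchor array. The tiling step in isolation, shrunk to one toy anchor on a 2×3 feature map:

```python
import numpy as np

feat_height, feat_width, feat_stride = 2, 3, 16
base_anchors = np.array([[-8., -8., 23., 23.]])   # one toy 32x32 anchor; the real code uses A = 9
shift_x, shift_y = np.meshgrid(np.arange(feat_width) * feat_stride,
                               np.arange(feat_height) * feat_stride)
shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                    shift_x.ravel(), shift_y.ravel())).transpose()   # (K, 4) with K = 6 cells
A, K = base_anchors.shape[0], shifts.shape[0]
all_anchors = (base_anchors.reshape((1, A, 4)) +
               shifts.reshape((1, K, 4)).transpose((1, 0, 2))).reshape((K * A, 4))
print(all_anchors.shape)   # (6, 4): every cell gets each base anchor, offset by its stride
```
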
- -""" -This file has functions about generating bounding box regression targets -""" - -import numpy as np - -from ..logger import logger -from .bbox_transform import bbox_overlaps, bbox_transform -from rcnn.config import config - - -def compute_bbox_regression_targets(rois, overlaps, labels): - """ - given rois, overlaps, gt labels, compute bounding box regression targets - :param rois: roidb[i]['boxes'] k * 4 - :param overlaps: roidb[i]['max_overlaps'] k * 1 - :param labels: roidb[i]['max_classes'] k * 1 - :return: targets[i][class, dx, dy, dw, dh] k * 5 - """ - # Ensure ROIs are floats - rois = rois.astype(np.float, copy=False) - - # Sanity check - if len(rois) != len(overlaps): - logger.warning('bbox regression: len(rois) != len(overlaps)') - - # Indices of ground-truth ROIs - gt_inds = np.where(overlaps == 1)[0] - if len(gt_inds) == 0: - logger.warning('bbox regression: len(gt_inds) == 0') - - # Indices of examples for which we try to make predictions - ex_inds = np.where(overlaps >= config.TRAIN.BBOX_REGRESSION_THRESH)[0] - - # Get IoU overlap between each ex ROI and gt ROI - ex_gt_overlaps = bbox_overlaps(rois[ex_inds, :], rois[gt_inds, :]) - - # Find which gt ROI each ex ROI has max overlap with: - # this will be the ex ROI's gt target - gt_assignment = ex_gt_overlaps.argmax(axis=1) - gt_rois = rois[gt_inds[gt_assignment], :] - ex_rois = rois[ex_inds, :] - - targets = np.zeros((rois.shape[0], 5), dtype=np.float32) - targets[ex_inds, 0] = labels[ex_inds] - targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) - return targets - - -def add_bbox_regression_targets(roidb): - """ - given roidb, add ['bbox_targets'] and normalize bounding box regression targets - :param roidb: roidb to be processed. must have gone through imdb.prepare_roidb - :return: means, std variances of targets - """ - logger.info('bbox regression: add bounding box regression targets') - assert len(roidb) > 0 - assert 'max_classes' in roidb[0] - - num_images = len(roidb) - num_classes = roidb[0]['gt_overlaps'].shape[1] - for im_i in range(num_images): - rois = roidb[im_i]['boxes'] - max_overlaps = roidb[im_i]['max_overlaps'] - max_classes = roidb[im_i]['max_classes'] - roidb[im_i]['bbox_targets'] = compute_bbox_regression_targets(rois, max_overlaps, max_classes) - - if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: - # use fixed / precomputed means and stds instead of empirical values - means = np.tile(np.array(config.TRAIN.BBOX_MEANS), (num_classes, 1)) - stds = np.tile(np.array(config.TRAIN.BBOX_STDS), (num_classes, 1)) - else: - # compute mean, std values - class_counts = np.zeros((num_classes, 1)) + 1e-14 - sums = np.zeros((num_classes, 4)) - squared_sums = np.zeros((num_classes, 4)) - for im_i in range(num_images): - targets = roidb[im_i]['bbox_targets'] - for cls in range(1, num_classes): - cls_indexes = np.where(targets[:, 0] == cls)[0] - if cls_indexes.size > 0: - class_counts[cls] += cls_indexes.size - sums[cls, :] += targets[cls_indexes, 1:].sum(axis=0) - squared_sums[cls, :] += (targets[cls_indexes, 1:] ** 2).sum(axis=0) - - means = sums / class_counts - # var(x) = E(x^2) - E(x)^2 - stds = np.sqrt(squared_sums / class_counts - means ** 2) - - # normalized targets - for im_i in range(num_images): - targets = roidb[im_i]['bbox_targets'] - for cls in range(1, num_classes): - cls_indexes = np.where(targets[:, 0] == cls)[0] - roidb[im_i]['bbox_targets'][cls_indexes, 1:] -= means[cls, :] - roidb[im_i]['bbox_targets'][cls_indexes, 1:] /= stds[cls, :] - - return means.ravel(), stds.ravel() - - -def 
expand_bbox_regression_targets(bbox_targets_data, num_classes): - """ - expand from 5 to 4 * num_classes; only the right class has non-zero bbox regression targets - :param bbox_targets_data: [k * 5] - :param num_classes: number of classes - :return: bbox target processed [k * 4 num_classes] - bbox_weights ! only foreground boxes have bbox regression computation! - """ - classes = bbox_targets_data[:, 0] - bbox_targets = np.zeros((classes.size, 4 * num_classes), dtype=np.float32) - bbox_weights = np.zeros(bbox_targets.shape, dtype=np.float32) - indexes = np.where(classes > 0)[0] - for index in indexes: - cls = classes[index] - start = int(4 * cls) - end = start + 4 - bbox_targets[index, start:end] = bbox_targets_data[index, 1:] - bbox_weights[index, start:end] = config.TRAIN.BBOX_WEIGHTS - return bbox_targets, bbox_weights - diff --git a/example/rcnn/rcnn/processing/generate_anchor.py b/example/rcnn/rcnn/processing/generate_anchor.py deleted file mode 100644 index 53c280dd45de..000000000000 --- a/example/rcnn/rcnn/processing/generate_anchor.py +++ /dev/null @@ -1,89 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Generate base anchors on index 0 -""" -from builtins import range -import numpy as np - - -def generate_anchors(base_size=16, ratios=[0.5, 1, 2], - scales=2 ** np.arange(3, 6)): - """ - Generate anchor (reference) windows by enumerating aspect ratios X - scales wrt a reference (0, 0, 15, 15) window. - """ - - base_anchor = np.array([1, 1, base_size, base_size]) - 1 - ratio_anchors = _ratio_enum(base_anchor, ratios) - anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) - for i in range(ratio_anchors.shape[0])]) - return anchors - - -def _whctrs(anchor): - """ - Return width, height, x center, and y center for an anchor (window). - """ - - w = anchor[2] - anchor[0] + 1 - h = anchor[3] - anchor[1] + 1 - x_ctr = anchor[0] + 0.5 * (w - 1) - y_ctr = anchor[1] + 0.5 * (h - 1) - return w, h, x_ctr, y_ctr - - -def _mkanchors(ws, hs, x_ctr, y_ctr): - """ - Given a vector of widths (ws) and heights (hs) around a center - (x_ctr, y_ctr), output a set of anchors (windows). - """ - - ws = ws[:, np.newaxis] - hs = hs[:, np.newaxis] - anchors = np.hstack((x_ctr - 0.5 * (ws - 1), - y_ctr - 0.5 * (hs - 1), - x_ctr + 0.5 * (ws - 1), - y_ctr + 0.5 * (hs - 1))) - return anchors - - -def _ratio_enum(anchor, ratios): - """ - Enumerate a set of anchors for each aspect ratio wrt an anchor. - """ - - w, h, x_ctr, y_ctr = _whctrs(anchor) - size = w * h - size_ratios = size / ratios - ws = np.round(np.sqrt(size_ratios)) - hs = np.round(ws * ratios) - anchors = _mkanchors(ws, hs, x_ctr, y_ctr) - return anchors - - -def _scale_enum(anchor, scales): - """ - Enumerate a set of anchors for each scale wrt an anchor. 
- """ - - w, h, x_ctr, y_ctr = _whctrs(anchor) - ws = w * scales - hs = h * scales - anchors = _mkanchors(ws, hs, x_ctr, y_ctr) - return anchors diff --git a/example/rcnn/rcnn/processing/nms.py b/example/rcnn/rcnn/processing/nms.py deleted file mode 100644 index eca8d58626d3..000000000000 --- a/example/rcnn/rcnn/processing/nms.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -from ..cython.cpu_nms import cpu_nms -try: - from ..cython.gpu_nms import gpu_nms -except ImportError: - gpu_nms = None - - -def py_nms_wrapper(thresh): - def _nms(dets): - return nms(dets, thresh) - return _nms - - -def cpu_nms_wrapper(thresh): - def _nms(dets): - return cpu_nms(dets, thresh) - return _nms - - -def gpu_nms_wrapper(thresh, device_id): - def _nms(dets): - return gpu_nms(dets, thresh, device_id) - if gpu_nms is not None: - return _nms - else: - return cpu_nms_wrapper(thresh) - - -def nms(dets, thresh): - """ - greedily select boxes with high confidence and overlap with current maximum <= thresh - rule out overlap >= thresh - :param dets: [[x1, y1, x2, y2 score]] - :param thresh: retain overlap < thresh - :return: indexes to keep - """ - x1 = dets[:, 0] - y1 = dets[:, 1] - x2 = dets[:, 2] - y2 = dets[:, 3] - scores = dets[:, 4] - - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - keep = [] - while order.size > 0: - i = order[0] - keep.append(i) - xx1 = np.maximum(x1[i], x1[order[1:]]) - yy1 = np.maximum(y1[i], y1[order[1:]]) - xx2 = np.minimum(x2[i], x2[order[1:]]) - yy2 = np.minimum(y2[i], y2[order[1:]]) - - w = np.maximum(0.0, xx2 - xx1 + 1) - h = np.maximum(0.0, yy2 - yy1 + 1) - inter = w * h - ovr = inter / (areas[i] + areas[order[1:]] - inter) - - inds = np.where(ovr <= thresh)[0] - order = order[inds + 1] - - return keep diff --git a/example/rcnn/rcnn/pycocotools/UPSTREAM_REV b/example/rcnn/rcnn/pycocotools/UPSTREAM_REV deleted file mode 100644 index 9613b145b237..000000000000 --- a/example/rcnn/rcnn/pycocotools/UPSTREAM_REV +++ /dev/null @@ -1 +0,0 @@ -https://github.com/pdollar/coco/commit/336d2a27c91e3c0663d2dcf0b13574674d30f88e diff --git a/example/rcnn/rcnn/pycocotools/__init__.py b/example/rcnn/rcnn/pycocotools/__init__.py deleted file mode 100644 index 2f4e0d430df9..000000000000 --- a/example/rcnn/rcnn/pycocotools/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -__author__ = 'tylin' diff --git a/example/rcnn/rcnn/pycocotools/_mask.pyx b/example/rcnn/rcnn/pycocotools/_mask.pyx deleted file mode 100644 index 1c3e127a1c05..000000000000 --- a/example/rcnn/rcnn/pycocotools/_mask.pyx +++ /dev/null @@ -1,308 +0,0 @@ -# distutils: language = c -# distutils: sources = maskApi.c - -#************************************************************************** -# Microsoft COCO Toolbox. version 2.0 -# Data, paper, and tutorials available at: http://mscoco.org/ -# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. -# Licensed under the Simplified BSD License [see coco/license.txt] -#************************************************************************** - -__author__ = 'tsungyi' - -import sys -PYTHON_VERSION = sys.version_info[0] - -# import both Python-level and C-level symbols of Numpy -# the API uses Numpy to interface C and Python -import numpy as np -cimport numpy as np -from libc.stdlib cimport malloc, free - -# intialized Numpy. must do. -np.import_array() - -# import numpy C function -# we use PyArray_ENABLEFLAGS to make Numpy ndarray responsible to memoery management -cdef extern from "numpy/arrayobject.h": - void PyArray_ENABLEFLAGS(np.ndarray arr, int flags) - -# Declare the prototype of the C functions in MaskApi.h -cdef extern from "maskApi.h": - ctypedef unsigned int uint - ctypedef unsigned long siz - ctypedef unsigned char byte - ctypedef double* BB - ctypedef struct RLE: - siz h, - siz w, - siz m, - uint* cnts, - void rlesInit( RLE **R, siz n ) - void rleEncode( RLE *R, const byte *M, siz h, siz w, siz n ) - void rleDecode( const RLE *R, byte *mask, siz n ) - void rleMerge( const RLE *R, RLE *M, siz n, int intersect ) - void rleArea( const RLE *R, siz n, uint *a ) - void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ) - void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) - void rleToBbox( const RLE *R, BB bb, siz n ) - void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ) - void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ) - char* rleToString( const RLE *R ) - void rleFrString( RLE *R, char *s, siz h, siz w ) - -# python class to wrap RLE array in C -# the class handles the memory allocation and deallocation -cdef class RLEs: - cdef RLE *_R - cdef siz _n - - def __cinit__(self, siz n =0): - rlesInit(&self._R, n) - self._n = n - - # free the RLE array here - def __dealloc__(self): - if self._R is not NULL: - for i in range(self._n): - free(self._R[i].cnts) - free(self._R) - def __getattr__(self, key): - if key == 'n': - return self._n - raise AttributeError(key) - -# python class to wrap Mask array in C -# the class handles the memory allocation and deallocation -cdef class Masks: - cdef byte *_mask - cdef siz _h - cdef siz _w - cdef siz _n - - def __cinit__(self, h, w, n): - self._mask = malloc(h*w*n* sizeof(byte)) - self._h = h - self._w = w - self._n = n - # def __dealloc__(self): - # the memory management of _mask has been passed to np.ndarray - # it doesn't need to be freed here - - # called when passing into np.array() and return an np.ndarray in column-major order - def 
__array__(self): - cdef np.npy_intp shape[1] - shape[0] = self._h*self._w*self._n - # Create a 1D array, and reshape it to fortran/Matlab column-major array - ndarray = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT8, self._mask).reshape((self._h, self._w, self._n), order='F') - # The _mask allocated by Masks is now handled by ndarray - PyArray_ENABLEFLAGS(ndarray, np.NPY_OWNDATA) - return ndarray - -# internal conversion from Python RLEs object to compressed RLE format -def _toString(RLEs Rs): - cdef siz n = Rs.n - cdef bytes py_string - cdef char* c_string - objs = [] - for i in range(n): - c_string = rleToString( &Rs._R[i] ) - py_string = c_string - objs.append({ - 'size': [Rs._R[i].h, Rs._R[i].w], - 'counts': py_string - }) - free(c_string) - return objs - -# internal conversion from compressed RLE format to Python RLEs object -def _frString(rleObjs): - cdef siz n = len(rleObjs) - Rs = RLEs(n) - cdef bytes py_string - cdef char* c_string - for i, obj in enumerate(rleObjs): - if PYTHON_VERSION == 2: - py_string = str(obj['counts']).encode('utf8') - elif PYTHON_VERSION == 3: - py_string = str.encode(obj['counts']) if type(obj['counts']) == str else obj['counts'] - else: - raise Exception('Python version must be 2 or 3') - c_string = py_string - rleFrString( &Rs._R[i], c_string, obj['size'][0], obj['size'][1] ) - return Rs - -# encode mask to RLEs objects -# list of RLE string can be generated by RLEs member function -def encode(np.ndarray[np.uint8_t, ndim=3, mode='fortran'] mask): - h, w, n = mask.shape[0], mask.shape[1], mask.shape[2] - cdef RLEs Rs = RLEs(n) - rleEncode(Rs._R,mask.data,h,w,n) - objs = _toString(Rs) - return objs - -# decode mask from compressed list of RLE string or RLEs object -def decode(rleObjs): - cdef RLEs Rs = _frString(rleObjs) - h, w, n = Rs._R[0].h, Rs._R[0].w, Rs._n - masks = Masks(h, w, n) - rleDecode(Rs._R, masks._mask, n); - return np.array(masks) - -def merge(rleObjs, intersect=0): - cdef RLEs Rs = _frString(rleObjs) - cdef RLEs R = RLEs(1) - rleMerge(Rs._R, R._R, Rs._n, intersect) - obj = _toString(R)[0] - return obj - -def area(rleObjs): - cdef RLEs Rs = _frString(rleObjs) - cdef uint* _a = malloc(Rs._n* sizeof(uint)) - rleArea(Rs._R, Rs._n, _a) - cdef np.npy_intp shape[1] - shape[0] = Rs._n - a = np.array((Rs._n, ), dtype=np.uint8) - a = np.PyArray_SimpleNewFromData(1, shape, np.NPY_UINT32, _a) - PyArray_ENABLEFLAGS(a, np.NPY_OWNDATA) - return a - -# iou computation. support function overload (RLEs-RLEs and bbox-bbox). -def iou( dt, gt, pyiscrowd ): - def _preproc(objs): - if len(objs) == 0: - return objs - if type(objs) == np.ndarray: - if len(objs.shape) == 1: - objs = objs.reshape((objs[0], 1)) - # check if it's Nx4 bbox - if not len(objs.shape) == 2 or not objs.shape[1] == 4: - raise Exception('numpy ndarray input is only for *bounding boxes* and should have Nx4 dimension') - objs = objs.astype(np.double) - elif type(objs) == list: - # check if list is in box format and convert it to np.ndarray - isbox = np.all(np.array([(len(obj)==4) and ((type(obj)==list) or (type(obj)==np.ndarray)) for obj in objs])) - isrle = np.all(np.array([type(obj) == dict for obj in objs])) - if isbox: - objs = np.array(objs, dtype=np.double) - if len(objs.shape) == 1: - objs = objs.reshape((1,objs.shape[0])) - elif isrle: - objs = _frString(objs) - else: - raise Exception('list input can be bounding box (Nx4) or RLEs ([RLE])') - else: - raise Exception('unrecognized type. 
The following type: RLEs (rle), np.ndarray (box), and list (box) are supported.') - return objs - def _rleIou(RLEs dt, RLEs gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): - rleIou( dt._R, gt._R, m, n, iscrowd.data, _iou.data ) - def _bbIou(np.ndarray[np.double_t, ndim=2] dt, np.ndarray[np.double_t, ndim=2] gt, np.ndarray[np.uint8_t, ndim=1] iscrowd, siz m, siz n, np.ndarray[np.double_t, ndim=1] _iou): - bbIou( dt.data, gt.data, m, n, iscrowd.data, _iou.data ) - def _len(obj): - cdef siz N = 0 - if type(obj) == RLEs: - N = obj.n - elif len(obj)==0: - pass - elif type(obj) == np.ndarray: - N = obj.shape[0] - return N - # convert iscrowd to numpy array - cdef np.ndarray[np.uint8_t, ndim=1] iscrowd = np.array(pyiscrowd, dtype=np.uint8) - # simple type checking - cdef siz m, n - dt = _preproc(dt) - gt = _preproc(gt) - m = _len(dt) - n = _len(gt) - if m == 0 or n == 0: - return [] - if not type(dt) == type(gt): - raise Exception('The dt and gt should have the same data type, either RLEs, list or np.ndarray') - - # define local variables - cdef double* _iou = 0 - cdef np.npy_intp shape[1] - # check type and assign iou function - if type(dt) == RLEs: - _iouFun = _rleIou - elif type(dt) == np.ndarray: - _iouFun = _bbIou - else: - raise Exception('input data type not allowed.') - _iou = malloc(m*n* sizeof(double)) - iou = np.zeros((m*n, ), dtype=np.double) - shape[0] = m*n - iou = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _iou) - PyArray_ENABLEFLAGS(iou, np.NPY_OWNDATA) - _iouFun(dt, gt, iscrowd, m, n, iou) - return iou.reshape((m,n), order='F') - -def toBbox( rleObjs ): - cdef RLEs Rs = _frString(rleObjs) - cdef siz n = Rs.n - cdef BB _bb = malloc(4*n* sizeof(double)) - rleToBbox( Rs._R, _bb, n ) - cdef np.npy_intp shape[1] - shape[0] = 4*n - bb = np.array((1,4*n), dtype=np.double) - bb = np.PyArray_SimpleNewFromData(1, shape, np.NPY_DOUBLE, _bb).reshape((n, 4)) - PyArray_ENABLEFLAGS(bb, np.NPY_OWNDATA) - return bb - -def frBbox(np.ndarray[np.double_t, ndim=2] bb, siz h, siz w ): - cdef siz n = bb.shape[0] - Rs = RLEs(n) - rleFrBbox( Rs._R, bb.data, h, w, n ) - objs = _toString(Rs) - return objs - -def frPoly( poly, siz h, siz w ): - cdef np.ndarray[np.double_t, ndim=1] np_poly - n = len(poly) - Rs = RLEs(n) - for i, p in enumerate(poly): - np_poly = np.array(p, dtype=np.double, order='F') - rleFrPoly( &Rs._R[i], np_poly.data, int(len(p)/2), h, w ) - objs = _toString(Rs) - return objs - -def frUncompressedRLE(ucRles, siz h, siz w): - cdef np.ndarray[np.uint32_t, ndim=1] cnts - cdef RLE R - cdef uint *data - n = len(ucRles) - objs = [] - for i in range(n): - Rs = RLEs(1) - cnts = np.array(ucRles[i]['counts'], dtype=np.uint32) - # time for malloc can be saved here but it's fine - data = malloc(len(cnts)* sizeof(uint)) - for j in range(len(cnts)): - data[j] = cnts[j] - R = RLE(ucRles[i]['size'][0], ucRles[i]['size'][1], len(cnts), data) - Rs._R[0] = R - objs.append(_toString(Rs)[0]) - return objs - -def frPyObjects(pyobj, h, w): - # encode rle from a list of python objects - if type(pyobj) == np.ndarray: - objs = frBbox(pyobj, h, w) - elif type(pyobj) == list and len(pyobj[0]) == 4: - objs = frBbox(pyobj, h, w) - elif type(pyobj) == list and len(pyobj[0]) > 4: - objs = frPoly(pyobj, h, w) - elif type(pyobj) == list and type(pyobj[0]) == dict \ - and 'counts' in pyobj[0] and 'size' in pyobj[0]: - objs = frUncompressedRLE(pyobj, h, w) - # encode rle from single python object - elif type(pyobj) == list and len(pyobj) == 4: - objs = 
frBbox([pyobj], h, w)[0] - elif type(pyobj) == list and len(pyobj) > 4: - objs = frPoly([pyobj], h, w)[0] - elif type(pyobj) == dict and 'counts' in pyobj and 'size' in pyobj: - objs = frUncompressedRLE([pyobj], h, w)[0] - else: - raise Exception('input type is not supported.') - return objs diff --git a/example/rcnn/rcnn/pycocotools/coco.py b/example/rcnn/rcnn/pycocotools/coco.py deleted file mode 100644 index 5cc835a05633..000000000000 --- a/example/rcnn/rcnn/pycocotools/coco.py +++ /dev/null @@ -1,445 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -__author__ = 'tylin' -__version__ = '2.0' -# Interface for accessing the Microsoft COCO dataset. - -# Microsoft COCO is a large image dataset designed for object detection, -# segmentation, and caption generation. pycocotools is a Python API that -# assists in loading, parsing and visualizing the annotations in COCO. -# Please visit http://mscoco.org/ for more information on COCO, including -# for the data, paper, and tutorials. The exact format of the annotations -# is also described on the COCO website. For example usage of the pycocotools -# please see pycocotools_demo.ipynb. In addition to this API, please download both -# the COCO images and annotations in order to run the demo. - -# An alternative to using the API is to load the annotations directly -# into Python dictionary -# Using the API provides additional utility functions. Note that this API -# supports both *instance* and *caption* annotations. In the case of -# captions not all functions are defined (e.g. categories are undefined). - -# The following API functions are defined: -# COCO - COCO api class that loads COCO annotation file and prepare data structures. -# decodeMask - Decode binary mask M encoded via run-length encoding. -# encodeMask - Encode binary mask M using run-length encoding. -# getAnnIds - Get ann ids that satisfy given filter conditions. -# getCatIds - Get cat ids that satisfy given filter conditions. -# getImgIds - Get img ids that satisfy given filter conditions. -# loadAnns - Load anns with the specified ids. -# loadCats - Load cats with the specified ids. -# loadImgs - Load imgs with the specified ids. -# annToMask - Convert segmentation in an annotation to binary mask. -# showAnns - Display the specified annotations. -# loadRes - Load algorithm results and create API for accessing them. -# download - Download COCO images from mscoco.org server. -# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. -# Help on each functions can be accessed by: "help COCO>function". - -# See also COCO>decodeMask, -# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, -# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, -# COCO>loadImgs, COCO>annToMask, COCO>showAnns - -# Microsoft COCO Toolbox. 
version 2.0 -# Data, paper, and tutorials available at: http://mscoco.org/ -# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. -# Licensed under the Simplified BSD License [see bsd.txt] - -import json -import time -import matplotlib.pyplot as plt -from matplotlib.collections import PatchCollection -from matplotlib.patches import Polygon -import numpy as np -import copy -import itertools -from . import mask as maskUtils -import os -from collections import defaultdict -import sys -PYTHON_VERSION = sys.version_info[0] -if PYTHON_VERSION == 2: - from urllib import urlretrieve -elif PYTHON_VERSION == 3: - from urllib.request import urlretrieve - -class COCO: - def __init__(self, annotation_file=None): - """ - Constructor of Microsoft COCO helper class for reading and visualizing annotations. - :param annotation_file (str): location of annotation file - :param image_folder (str): location to the folder that hosts images. - :return: - """ - # load dataset - self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() - self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) - if not annotation_file == None: - print('loading annotations into memory...') - tic = time.time() - dataset = json.load(open(annotation_file, 'r')) - assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) - print('Done (t={:0.2f}s)'.format(time.time()- tic)) - self.dataset = dataset - self.createIndex() - - def createIndex(self): - # create index - print('creating index...') - anns, cats, imgs = {}, {}, {} - imgToAnns,catToImgs = defaultdict(list),defaultdict(list) - if 'annotations' in self.dataset: - for ann in self.dataset['annotations']: - imgToAnns[ann['image_id']].append(ann) - anns[ann['id']] = ann - - if 'images' in self.dataset: - for img in self.dataset['images']: - imgs[img['id']] = img - - if 'categories' in self.dataset: - for cat in self.dataset['categories']: - cats[cat['id']] = cat - - if 'annotations' in self.dataset and 'categories' in self.dataset: - for ann in self.dataset['annotations']: - catToImgs[ann['category_id']].append(ann['image_id']) - - print('index created!') - - # create class members - self.anns = anns - self.imgToAnns = imgToAnns - self.catToImgs = catToImgs - self.imgs = imgs - self.cats = cats - - def info(self): - """ - Print information about the annotation file. - :return: - """ - for key, value in self.dataset['info'].items(): - print('{}: {}'.format(key, value)) - - def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): - """ - Get ann ids that satisfy given filter conditions. default skips that filter - :param imgIds (int array) : get anns for given imgs - catIds (int array) : get anns for given cats - areaRng (float array) : get anns for given area range (e.g. 
[0 inf]) - iscrowd (boolean) : get anns for given crowd label (False or True) - :return: ids (int array) : integer array of ann ids - """ - imgIds = imgIds if type(imgIds) == list else [imgIds] - catIds = catIds if type(catIds) == list else [catIds] - - if len(imgIds) == len(catIds) == len(areaRng) == 0: - anns = self.dataset['annotations'] - else: - if not len(imgIds) == 0: - lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] - anns = list(itertools.chain.from_iterable(lists)) - else: - anns = self.dataset['annotations'] - anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] - anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] - if not iscrowd == None: - ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] - else: - ids = [ann['id'] for ann in anns] - return ids - - def getCatIds(self, catNms=[], supNms=[], catIds=[]): - """ - filtering parameters. default skips that filter. - :param catNms (str array) : get cats for given cat names - :param supNms (str array) : get cats for given supercategory names - :param catIds (int array) : get cats for given cat ids - :return: ids (int array) : integer array of cat ids - """ - catNms = catNms if type(catNms) == list else [catNms] - supNms = supNms if type(supNms) == list else [supNms] - catIds = catIds if type(catIds) == list else [catIds] - - if len(catNms) == len(supNms) == len(catIds) == 0: - cats = self.dataset['categories'] - else: - cats = self.dataset['categories'] - cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] - cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] - cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] - ids = [cat['id'] for cat in cats] - return ids - - def getImgIds(self, imgIds=[], catIds=[]): - ''' - Get img ids that satisfy given filter conditions. - :param imgIds (int array) : get imgs for given ids - :param catIds (int array) : get imgs with all given cats - :return: ids (int array) : integer array of img ids - ''' - imgIds = imgIds if type(imgIds) == list else [imgIds] - catIds = catIds if type(catIds) == list else [catIds] - - if len(imgIds) == len(catIds) == 0: - ids = self.imgs.keys() - else: - ids = set(imgIds) - for i, catId in enumerate(catIds): - if i == 0 and len(ids) == 0: - ids = set(self.catToImgs[catId]) - else: - ids &= set(self.catToImgs[catId]) - return list(ids) - - def loadAnns(self, ids=[]): - """ - Load anns with the specified ids. - :param ids (int array) : integer ids specifying anns - :return: anns (object array) : loaded ann objects - """ - if type(ids) == list: - return [self.anns[id] for id in ids] - elif type(ids) == int: - return [self.anns[ids]] - - def loadCats(self, ids=[]): - """ - Load cats with the specified ids. - :param ids (int array) : integer ids specifying cats - :return: cats (object array) : loaded cat objects - """ - if type(ids) == list: - return [self.cats[id] for id in ids] - elif type(ids) == int: - return [self.cats[ids]] - - def loadImgs(self, ids=[]): - """ - Load anns with the specified ids. - :param ids (int array) : integer ids specifying img - :return: imgs (object array) : loaded img objects - """ - if type(ids) == list: - return [self.imgs[id] for id in ids] - elif type(ids) == int: - return [self.imgs[ids]] - - def showAnns(self, anns): - """ - Display the specified annotations. 
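
The id-filter methods above compose into the usual COCO access chain: category names to category ids to image ids to annotation ids to annotation dicts. A typical chain against the upstream `pycocotools` package (this file was a vendored copy of it; the annotation path is hypothetical):

```python
from pycocotools.coco import COCO

coco = COCO('annotations/instances_val2017.json')   # hypothetical annotation file
cat_ids = coco.getCatIds(catNms=['person'])
img_ids = coco.getImgIds(catIds=cat_ids)            # images containing all requested cats
ann_ids = coco.getAnnIds(imgIds=img_ids[:1], catIds=cat_ids, iscrowd=None)
anns = coco.loadAnns(ann_ids)
print(len(img_ids), len(anns))
```
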
- :param anns (array of object): annotations to display - :return: None - """ - if len(anns) == 0: - return 0 - if 'segmentation' in anns[0] or 'keypoints' in anns[0]: - datasetType = 'instances' - elif 'caption' in anns[0]: - datasetType = 'captions' - else: - raise Exception('datasetType not supported') - if datasetType == 'instances': - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in anns: - c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] - if 'segmentation' in ann: - if type(ann['segmentation']) == list: - # polygon - for seg in ann['segmentation']: - poly = np.array(seg).reshape((int(len(seg)/2), 2)) - polygons.append(Polygon(poly)) - color.append(c) - else: - # mask - t = self.imgs[ann['image_id']] - if type(ann['segmentation']['counts']) == list: - rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) - else: - rle = [ann['segmentation']] - m = maskUtils.decode(rle) - img = np.ones( (m.shape[0], m.shape[1], 3) ) - if ann['iscrowd'] == 1: - color_mask = np.array([2.0,166.0,101.0])/255 - if ann['iscrowd'] == 0: - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack( (img, m*0.5) )) - if 'keypoints' in ann and type(ann['keypoints']) == list: - # turn skeleton into zero-based index - sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 - kp = np.array(ann['keypoints']) - x = kp[0::3] - y = kp[1::3] - v = kp[2::3] - for sk in sks: - if np.all(v[sk]>0): - plt.plot(x[sk],y[sk], linewidth=3, color=c) - plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) - plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) - p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) - ax.add_collection(p) - p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) - ax.add_collection(p) - elif datasetType == 'captions': - for ann in anns: - print(ann['caption']) - - def loadRes(self, resFile): - """ - Load result file and return a result api object. 
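
`showAnns` above only draws polygons, masks, and keypoint skeletons onto the current matplotlib axes, so the image itself has to be displayed first. A sketch continuing the names from the previous snippet (the image directory is hypothetical):

```python
import matplotlib.pyplot as plt
import skimage.io as io

img = coco.loadImgs(img_ids[0])[0]
I = io.imread('val2017/' + img['file_name'])        # hypothetical image directory
plt.imshow(I)
plt.axis('off')
coco.showAnns(coco.loadAnns(coco.getAnnIds(imgIds=img['id'])))
plt.show()
```
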
- :param resFile (str) : file name of result file - :return: res (obj) : result api object - """ - res = COCO() - res.dataset['images'] = [img for img in self.dataset['images']] - - print('Loading and preparing results...') - tic = time.time() - if type(resFile) == str or type(resFile) == unicode: - anns = json.load(open(resFile)) - elif type(resFile) == np.ndarray: - anns = self.loadNumpyAnnotations(resFile) - else: - anns = resFile - assert type(anns) == list, 'results in not an array of objects' - annsImgIds = [ann['image_id'] for ann in anns] - assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ - 'Results do not correspond to current coco set' - if 'caption' in anns[0]: - imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) - res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] - for id, ann in enumerate(anns): - ann['id'] = id+1 - elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - bb = ann['bbox'] - x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] - if not 'segmentation' in ann: - ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] - ann['area'] = bb[2]*bb[3] - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'segmentation' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - # now only support compressed RLE format as segmentation results - ann['area'] = maskUtils.area(ann['segmentation']) - if not 'bbox' in ann: - ann['bbox'] = maskUtils.toBbox(ann['segmentation']) - ann['id'] = id+1 - ann['iscrowd'] = 0 - elif 'keypoints' in anns[0]: - res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) - for id, ann in enumerate(anns): - s = ann['keypoints'] - x = s[0::3] - y = s[1::3] - x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) - ann['area'] = (x1-x0)*(y1-y0) - ann['id'] = id + 1 - ann['bbox'] = [x0,y0,x1-x0,y1-y0] - print('DONE (t={:0.2f}s)'.format(time.time()- tic)) - - res.dataset['annotations'] = anns - res.createIndex() - return res - - def download(self, tarDir = None, imgIds = [] ): - ''' - Download COCO images from mscoco.org server. 
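
`loadRes` above wraps detection results (a JSON file, a list of dicts, or an Nx7 ndarray) in a second `COCO` object, synthesizing `id`, `area`, and `iscrowd` fields so the evaluator can treat detections like annotations. Continuing the sketch (the results file name is hypothetical):

```python
coco_dt = coco.loadRes('detections_val2017.json')   # hypothetical results file
# An Nx7 ndarray with rows {imageID, x1, y1, w, h, score, class} works too:
# coco_dt = coco.loadRes(results_array)
```
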
- :param tarDir (str): COCO results directory name - imgIds (list): images to be downloaded - :return: - ''' - if tarDir is None: - print('Please specify target directory') - return -1 - if len(imgIds) == 0: - imgs = self.imgs.values() - else: - imgs = self.loadImgs(imgIds) - N = len(imgs) - if not os.path.exists(tarDir): - os.makedirs(tarDir) - for i, img in enumerate(imgs): - tic = time.time() - fname = os.path.join(tarDir, img['file_name']) - if not os.path.exists(fname): - urlretrieve(img['coco_url'], fname) - print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) - - def loadNumpyAnnotations(self, data): - """ - Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} - :param data (numpy.ndarray) - :return: annotations (python nested list) - """ - print('Converting ndarray to lists...') - assert(type(data) == np.ndarray) - print(data.shape) - assert(data.shape[1] == 7) - N = data.shape[0] - ann = [] - for i in range(N): - if i % 1000000 == 0: - print('{}/{}'.format(i,N)) - ann += [{ - 'image_id' : int(data[i, 0]), - 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], - 'score' : data[i, 5], - 'category_id': int(data[i, 6]), - }] - return ann - - def annToRLE(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE to RLE. - :return: binary mask (numpy 2D array) - """ - t = self.imgs[ann['image_id']] - h, w = t['height'], t['width'] - segm = ann['segmentation'] - if type(segm) == list: - # polygon -- a single object might consist of multiple parts - # we merge all parts into one mask rle code - rles = maskUtils.frPyObjects(segm, h, w) - rle = maskUtils.merge(rles) - elif type(segm['counts']) == list: - # uncompressed RLE - rle = maskUtils.frPyObjects(segm, h, w) - else: - # rle - rle = ann['segmentation'] - return rle - - def annToMask(self, ann): - """ - Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. - :return: binary mask (numpy 2D array) - """ - rle = self.annToRLE(ann) - m = maskUtils.decode(rle) - return m diff --git a/example/rcnn/rcnn/pycocotools/cocoeval.py b/example/rcnn/rcnn/pycocotools/cocoeval.py deleted file mode 100644 index e1d181b5bc32..000000000000 --- a/example/rcnn/rcnn/pycocotools/cocoeval.py +++ /dev/null @@ -1,545 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -__author__ = 'tsungyi' - -import numpy as np -import datetime -import time -from collections import defaultdict -from .mask import * -import copy - -class COCOeval: - # Interface for evaluating detection on the Microsoft COCO dataset. - # - # The usage for CocoEval is as follows: - # cocoGt=..., cocoDt=... 
# load dataset and results - # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object - # E.params.recThrs = ...; # set parameters as desired - # E.evaluate(); # run per image evaluation - # E.accumulate(); # accumulate per image results - # E.summarize(); # display summary metrics of results - # For example usage see evalDemo.m and http://mscoco.org/. - # - # The evaluation parameters are as follows (defaults in brackets): - # imgIds - [all] N img ids to use for evaluation - # catIds - [all] K cat ids to use for evaluation - # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation - # recThrs - [0:.01:1] R=101 recall thresholds for evaluation - # areaRng - [...] A=4 object area ranges for evaluation - # maxDets - [1 10 100] M=3 thresholds on max detections per image - # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' - # iouType replaced the now DEPRECATED useSegm parameter. - # useCats - [1] if true use category labels for evaluation - # Note: if useCats=0 category labels are ignored as in proposal scoring. - # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. - # - # evaluate(): evaluates detections on every image and every category and - # concats the results into the "evalImgs" with fields: - # dtIds - [1xD] id for each of the D detections (dt) - # gtIds - [1xG] id for each of the G ground truths (gt) - # dtMatches - [TxD] matching gt id at each IoU or 0 - # gtMatches - [TxG] matching dt id at each IoU or 0 - # dtScores - [1xD] confidence of each dt - # gtIgnore - [1xG] ignore flag for each gt - # dtIgnore - [TxD] ignore flag for each dt at each IoU - # - # accumulate(): accumulates the per-image, per-category evaluation - # results in "evalImgs" into the dictionary "eval" with fields: - # params - parameters used for evaluation - # date - date evaluation was performed - # counts - [T,R,K,A,M] parameter dimensions (see above) - # precision - [TxRxKxAxM] precision for every evaluation setting - # recall - [TxKxAxM] max recall for every evaluation setting - # Note: precision and recall==-1 for settings with no gt objects. - # - # See also coco, mask, pycocoDemo, pycocoEvalDemo - # - # Microsoft COCO Toolbox. version 2.0 - # Data, paper, and tutorials available at: http://mscoco.org/ - # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. - # Licensed under the Simplified BSD License [see coco/license.txt] - def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): - ''' - Initialize CocoEval using coco APIs for gt and dt - :param cocoGt: coco object with ground truth annotations - :param cocoDt: coco object with detection results - :return: None - ''' - if not iouType: - print('iouType not specified. 
use default iouType segm') - self.cocoGt = cocoGt # ground truth COCO API - self.cocoDt = cocoDt # detections COCO API - self.params = {} # evaluation parameters - self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements - self.eval = {} # accumulated evaluation results - self._gts = defaultdict(list) # gt for evaluation - self._dts = defaultdict(list) # dt for evaluation - self.params = Params(iouType=iouType) # parameters - self._paramsEval = {} # parameters for evaluation - self.stats = [] # result summarization - self.ious = {} # ious between all gts and dts - if not cocoGt is None: - self.params.imgIds = sorted(cocoGt.getImgIds()) - self.params.catIds = sorted(cocoGt.getCatIds()) - - - def _prepare(self): - ''' - Prepare ._gts and ._dts for evaluation based on params - :return: None - ''' - def _toMask(anns, coco): - # modify ann['segmentation'] by reference - for ann in anns: - rle = coco.annToRLE(ann) - ann['segmentation'] = rle - p = self.params - if p.useCats: - gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) - dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) - else: - gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) - dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) - - # convert ground truth to mask if iouType == 'segm' - if p.iouType == 'segm': - _toMask(gts, self.cocoGt) - _toMask(dts, self.cocoDt) - # set ignore flag - for gt in gts: - gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 - gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] - if p.iouType == 'keypoints': - gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] - self._gts = defaultdict(list) # gt for evaluation - self._dts = defaultdict(list) # dt for evaluation - for gt in gts: - self._gts[gt['image_id'], gt['category_id']].append(gt) - for dt in dts: - self._dts[dt['image_id'], dt['category_id']].append(dt) - self.evalImgs = defaultdict(list) # per-image per-category evaluation results - self.eval = {} # accumulated evaluation results - - def evaluate(self): - ''' - Run per image evaluation on given images and store results (a list of dict) in self.evalImgs - :return: None - ''' - tic = time.time() - print('Running per image evaluation...') - p = self.params - # add backward compatibility if useSegm is specified in params - if not p.useSegm is None: - p.iouType = 'segm' if p.useSegm == 1 else 'bbox' - print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) - print('Evaluate annotation type *{}*'.format(p.iouType)) - p.imgIds = list(np.unique(p.imgIds)) - if p.useCats: - p.catIds = list(np.unique(p.catIds)) - p.maxDets = sorted(p.maxDets) - self.params=p - - self._prepare() - # loop through images, area range, max detection number - catIds = p.catIds if p.useCats else [-1] - - if p.iouType == 'segm' or p.iouType == 'bbox': - computeIoU = self.computeIoU - elif p.iouType == 'keypoints': - computeIoU = self.computeOks - self.ious = {(imgId, catId): computeIoU(imgId, catId) \ - for imgId in p.imgIds - for catId in catIds} - - evaluateImg = self.evaluateImg - maxDet = p.maxDets[-1] - self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet) - for catId in catIds - for areaRng in p.areaRng - for imgId in p.imgIds - ] - self._paramsEval = copy.deepcopy(self.params) - toc = time.time() - print('DONE (t={:0.2f}s).'.format(toc-tic)) - - def computeIoU(self, imgId, catId): - p = self.params - if p.useCats: - gt = self._gts[imgId,catId] - dt = self._dts[imgId,catId] - else: - gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] - dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] - if len(gt) == 0 and len(dt) ==0: - return [] - inds = np.argsort([-d['score'] for d in dt], kind='mergesort') - dt = [dt[i] for i in inds] - if len(dt) > p.maxDets[-1]: - dt=dt[0:p.maxDets[-1]] - - if p.iouType == 'segm': - g = [g['segmentation'] for g in gt] - d = [d['segmentation'] for d in dt] - elif p.iouType == 'bbox': - g = [g['bbox'] for g in gt] - d = [d['bbox'] for d in dt] - else: - raise Exception('unknown iouType for iou computation') - - # compute iou between each dt and gt region - iscrowd = [int(o['iscrowd']) for o in gt] - ious = iou(d,g,iscrowd) - return ious - - def computeOks(self, imgId, catId): - p = self.params - # dimention here should be Nxm - gts = self._gts[imgId, catId] - dts = self._dts[imgId, catId] - inds = np.argsort([-d['score'] for d in dts], kind='mergesort') - dts = [dts[i] for i in inds] - if len(dts) > p.maxDets[-1]: - dts = dts[0:p.maxDets[-1]] - # if len(gts) == 0 and len(dts) == 0: - if len(gts) == 0 or len(dts) == 0: - return [] - ious = np.zeros((len(dts), len(gts))) - sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 - vars = (sigmas * 2)**2 - k = len(sigmas) - # compute oks between each detection and ground truth object - for j, gt in enumerate(gts): - # create bounds for ignore regions(double the gt bbox) - g = np.array(gt['keypoints']) - xg = g[0::3]; yg = g[1::3]; vg = g[2::3] - k1 = np.count_nonzero(vg > 0) - bb = gt['bbox'] - x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 - y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 - for i, dt in enumerate(dts): - d = np.array(dt['keypoints']) - xd = d[0::3]; yd = d[1::3] - if k1>0: - # measure the per-keypoint distance if keypoints visible - dx = xd - xg - dy = yd - yg - else: - # measure minimum distance to keypoints in (x0,y0) & (x1,y1) - z = np.zeros((k)) - dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) - dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) - e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2 - if k1 > 0: - e=e[vg > 0] - ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] - return ious - - def evaluateImg(self, imgId, catId, aRng, maxDet): - ''' - perform evaluation for single category and image - :return: dict (single image results) - ''' - p = self.params - if p.useCats: - gt = self._gts[imgId,catId] - dt = self._dts[imgId,catId] - else: - gt 
= [_ for cId in p.catIds for _ in self._gts[imgId,cId]] - dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] - if len(gt) == 0 and len(dt) ==0: - return None - - for g in gt: - if g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]): - g['_ignore'] = 1 - else: - g['_ignore'] = 0 - - # sort dt highest score first, sort gt ignore last - gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') - gt = [gt[i] for i in gtind] - dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') - dt = [dt[i] for i in dtind[0:maxDet]] - iscrowd = [int(o['iscrowd']) for o in gt] - # load computed ious - ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] - - T = len(p.iouThrs) - G = len(gt) - D = len(dt) - gtm = np.zeros((T,G)) - dtm = np.zeros((T,D)) - gtIg = np.array([g['_ignore'] for g in gt]) - dtIg = np.zeros((T,D)) - if not len(ious)==0: - for tind, t in enumerate(p.iouThrs): - for dind, d in enumerate(dt): - # information about best match so far (m=-1 -> unmatched) - iou = min([t,1-1e-10]) - m = -1 - for gind, g in enumerate(gt): - # if this gt already matched, and not a crowd, continue - if gtm[tind,gind]>0 and not iscrowd[gind]: - continue - # if dt matched to reg gt, and on ignore gt, stop - if m>-1 and gtIg[m]==0 and gtIg[gind]==1: - break - # continue to next gt unless better match made - if ious[dind,gind] < iou: - continue - # if match successful and best so far, store appropriately - iou=ious[dind,gind] - m=gind - # if match made store id of match for both dt and gt - if m ==-1: - continue - dtIg[tind,dind] = gtIg[m] - dtm[tind,dind] = gt[m]['id'] - gtm[tind,m] = d['id'] - # set unmatched detections outside of area range to ignore - a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt))) - dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) - # store results for given image and category - return { - 'image_id': imgId, - 'category_id': catId, - 'aRng': aRng, - 'maxDet': maxDet, - 'dtIds': [d['id'] for d in dt], - 'gtIds': [g['id'] for g in gt], - 'dtMatches': dtm, - 'gtMatches': gtm, - 'dtScores': [d['score'] for d in dt], - 'gtIgnore': gtIg, - 'dtIgnore': dtIg, - } - - def accumulate(self, p = None): - ''' - Accumulate per image evaluation results and store the result in self.eval - :param p: input params for evaluation - :return: None - ''' - print('Accumulating evaluation results...') - tic = time.time() - if not self.evalImgs: - print('Please run evaluate() first') - # allows input customized parameters - if p is None: - p = self.params - p.catIds = p.catIds if p.useCats == 1 else [-1] - T = len(p.iouThrs) - R = len(p.recThrs) - K = len(p.catIds) if p.useCats else 1 - A = len(p.areaRng) - M = len(p.maxDets) - precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories - recall = -np.ones((T,K,A,M)) - - # create dictionary for future indexing - _pe = self._paramsEval - catIds = _pe.catIds if _pe.useCats else [-1] - setK = set(catIds) - setA = set(map(tuple, _pe.areaRng)) - setM = set(_pe.maxDets) - setI = set(_pe.imgIds) - # get inds to evaluate - k_list = [n for n, k in enumerate(p.catIds) if k in setK] - m_list = [m for n, m in enumerate(p.maxDets) if m in setM] - a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] - i_list = [n for n, i in enumerate(p.imgIds) if i in setI] - I0 = len(_pe.imgIds) - A0 = len(_pe.areaRng) - # retrieve E at each category, area range, and max number of detections - for k, k0 in enumerate(k_list): - Nk = k0*A0*I0 - for a, a0 in 
enumerate(a_list): - Na = a0*I0 - for m, maxDet in enumerate(m_list): - E = [self.evalImgs[Nk + Na + i] for i in i_list] - E = [e for e in E if not e is None] - if len(E) == 0: - continue - dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) - - # different sorting method generates slightly different results. - # mergesort is used to be consistent as Matlab implementation. - inds = np.argsort(-dtScores, kind='mergesort') - - dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] - dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] - gtIg = np.concatenate([e['gtIgnore'] for e in E]) - npig = np.count_nonzero(gtIg==0 ) - if npig == 0: - continue - tps = np.logical_and( dtm, np.logical_not(dtIg) ) - fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) - - tp_sum = np.cumsum(tps, axis=1).astype(dtype=np.float) - fp_sum = np.cumsum(fps, axis=1).astype(dtype=np.float) - for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): - tp = np.array(tp) - fp = np.array(fp) - nd = len(tp) - rc = tp / npig - pr = tp / (fp+tp+np.spacing(1)) - q = np.zeros((R,)) - - if nd: - recall[t,k,a,m] = rc[-1] - else: - recall[t,k,a,m] = 0 - - # numpy is slow without cython optimization for accessing elements - # use python array gets significant speed improvement - pr = pr.tolist(); q = q.tolist() - - for i in range(nd-1, 0, -1): - if pr[i] > pr[i-1]: - pr[i-1] = pr[i] - - inds = np.searchsorted(rc, p.recThrs, side='left') - try: - for ri, pi in enumerate(inds): - q[ri] = pr[pi] - except: - pass - precision[t,:,k,a,m] = np.array(q) - self.eval = { - 'params': p, - 'counts': [T, R, K, A, M], - 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), - 'precision': precision, - 'recall': recall, - } - toc = time.time() - print('DONE (t={:0.2f}s).'.format( toc-tic)) - - def summarize(self): - ''' - Compute and display summary metrics for evaluation results. 
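The evaluation flow sketched in the usage comment near the top of this file, written out as a runnable fragment; the file names are placeholders and 'bbox' stands in for whichever iouType is being scored:

```python
from rcnn.pycocotools.coco import COCO
from rcnn.pycocotools.cocoeval import COCOeval

coco_gt = COCO('instances_minival.json')        # placeholder ground-truth file
coco_dt = coco_gt.loadRes('detections.json')    # placeholder results file
E = COCOeval(coco_gt, coco_dt, iouType='bbox')
E.evaluate()      # per-image, per-category matching at each IoU threshold
E.accumulate()    # fills the [T,R,K,A,M] precision and [T,K,A,M] recall arrays
E.summarize()     # prints the 12 AP/AR lines; E.stats[0] is AP@[.50:.95]
```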
- Note this functin can *only* be applied on the default parameter setting - ''' - def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): - p = self.params - iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' - titleStr = 'Average Precision' if ap == 1 else 'Average Recall' - typeStr = '(AP)' if ap==1 else '(AR)' - iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ - if iouThr is None else '{:0.2f}'.format(iouThr) - - aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] - mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] - if ap == 1: - # dimension of precision: [TxRxKxAxM] - s = self.eval['precision'] - # IoU - if iouThr is not None: - t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:,:,:,aind,mind] - else: - # dimension of recall: [TxKxAxM] - s = self.eval['recall'] - if iouThr is not None: - t = np.where(iouThr == p.iouThrs)[0] - s = s[t] - s = s[:,:,aind,mind] - if len(s[s>-1])==0: - mean_s = -1 - else: - mean_s = np.mean(s[s>-1]) - print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) - return mean_s - def _summarizeDets(): - stats = np.zeros((12,)) - stats[0] = _summarize(1) - stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) - stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2]) - stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) - stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) - stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) - stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) - stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) - stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) - stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) - stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) - stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) - return stats - def _summarizeKps(): - stats = np.zeros((10,)) - stats[0] = _summarize(1, maxDets=20) - stats[1] = _summarize(1, maxDets=20, iouThr=.5) - stats[2] = _summarize(1, maxDets=20, iouThr=.75) - stats[3] = _summarize(1, maxDets=20, areaRng='medium') - stats[4] = _summarize(1, maxDets=20, areaRng='large') - stats[5] = _summarize(0, maxDets=20) - stats[6] = _summarize(0, maxDets=20, iouThr=.5) - stats[7] = _summarize(0, maxDets=20, iouThr=.75) - stats[8] = _summarize(0, maxDets=20, areaRng='medium') - stats[9] = _summarize(0, maxDets=20, areaRng='large') - return stats - if not self.eval: - raise Exception('Please run accumulate() first') - iouType = self.params.iouType - if iouType == 'segm' or iouType == 'bbox': - summarize = _summarizeDets - elif iouType == 'keypoints': - summarize = _summarizeKps - self.stats = summarize() - - def __str__(self): - self.summarize() - -class Params: - ''' - Params for coco evaluation api - ''' - def setDetParams(self): - self.imgIds = [] - self.catIds = [] - # np.arange causes trouble. 
the data point on arange is slightly larger than the true value - self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) - self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) - self.maxDets = [1, 10, 100] - self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] - self.areaRngLbl = ['all', 'small', 'medium', 'large'] - self.useCats = 1 - - def setKpParams(self): - self.imgIds = [] - self.catIds = [] - # np.arange causes trouble. the data point on arange is slightly larger than the true value - self.iouThrs = np.linspace(.5, 0.95, np.round((0.95 - .5) / .05) + 1, endpoint=True) - self.recThrs = np.linspace(.0, 1.00, np.round((1.00 - .0) / .01) + 1, endpoint=True) - self.maxDets = [20] - self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] - self.areaRngLbl = ['all', 'medium', 'large'] - self.useCats = 1 - - def __init__(self, iouType='segm'): - if iouType == 'segm' or iouType == 'bbox': - self.setDetParams() - elif iouType == 'keypoints': - self.setKpParams() - else: - raise Exception('iouType not supported') - self.iouType = iouType - # useSegm is deprecated - self.useSegm = None diff --git a/example/rcnn/rcnn/pycocotools/mask.py b/example/rcnn/rcnn/pycocotools/mask.py deleted file mode 100644 index 2122468f6817..000000000000 --- a/example/rcnn/rcnn/pycocotools/mask.py +++ /dev/null @@ -1,120 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -__author__ = 'tsungyi' - -from rcnn.pycocotools import _mask - -# Interface for manipulating masks stored in RLE format. -# -# RLE is a simple yet efficient format for storing binary masks. RLE -# first divides a vector (or vectorized image) into a series of piecewise -# constant regions and then for each piece simply stores the length of -# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would -# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] -# (note that the odd counts are always the numbers of zeros). Instead of -# storing the counts directly, additional compression is achieved with a -# variable bitrate representation based on a common scheme called LEB128. -# -# Compression is greatest given large piecewise constant regions. -# Specifically, the size of the RLE is proportional to the number of -# *boundaries* in M (or for an image the number of boundaries in the y -# direction). Assuming fairly simple shapes, the RLE representation is -# O(sqrt(n)) where n is number of pixels in the object. Hence space usage -# is substantially lower, especially for large simple objects (large n). -# -# Many common operations on masks can be computed directly using the RLE -# (without need for decoding). 
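A tiny pure-Python illustration of the counts scheme described above, without the LEB128-style compression the C implementation adds; the helper name is invented for this sketch:

```python
def rle_counts(m):
    """Run lengths of a binary vector; the first count is always the number of leading zeros."""
    counts, prev, run = [], 0, 0
    for v in m:
        if v != prev:          # value changed: close the current run
            counts.append(run)
            prev, run = v, 0
        run += 1
    counts.append(run)          # close the final run
    return counts

assert rle_counts([0, 0, 1, 1, 1, 0, 1]) == [2, 3, 1, 1]   # the M=[0 0 1 1 1 0 1] example
assert rle_counts([1, 1, 1, 1, 1, 1, 0]) == [0, 6, 1]      # the M=[1 1 1 1 1 1 0] example
```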
This includes computations such as area, -# union, intersection, etc. All of these operations are linear in the -# size of the RLE, in other words they are O(sqrt(n)) where n is the area -# of the object. Computing these operations on the original mask is O(n). -# Thus, using the RLE can result in substantial computational savings. -# -# The following API functions are defined: -# encode - Encode binary masks using RLE. -# decode - Decode binary masks encoded via RLE. -# merge - Compute union or intersection of encoded masks. -# iou - Compute intersection over union between masks. -# area - Compute area of encoded masks. -# toBbox - Get bounding boxes surrounding encoded masks. -# frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. -# -# Usage: -# Rs = encode( masks ) -# masks = decode( Rs ) -# R = merge( Rs, intersect=false ) -# o = iou( dt, gt, iscrowd ) -# a = area( Rs ) -# bbs = toBbox( Rs ) -# Rs = frPyObjects( [pyObjects], h, w ) -# -# In the API the following formats are used: -# Rs - [dict] Run-length encoding of binary masks -# R - dict Run-length encoding of binary mask -# masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) -# iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore -# bbs - [nx4] Bounding box(es) stored as [x y w h] -# poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) -# dt,gt - May be either bounding boxes or encoded masks -# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). -# -# Finally, a note about the intersection over union (iou) computation. -# The standard iou of a ground truth (gt) and detected (dt) object is -# iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) -# For "crowd" regions, we use a modified criteria. If a gt object is -# marked as "iscrowd", we allow a dt to match any subregion of the gt. -# Choosing gt' in the crowd gt that best matches the dt can be done using -# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing -# iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) -# For crowd gt regions we use this modified criteria above for the iou. -# -# To compile run "python setup.py build_ext --inplace" -# Please do not contact us for help with compiling. -# -# Microsoft COCO Toolbox. version 2.0 -# Data, paper, and tutorials available at: http://mscoco.org/ -# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. -# Licensed under the Simplified BSD License [see coco/license.txt] - -iou = _mask.iou -merge = _mask.merge -frPyObjects = _mask.frPyObjects - -def encode(bimask): - if len(bimask.shape) == 3: - return _mask.encode(bimask) - elif len(bimask.shape) == 2: - h, w = bimask.shape - return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] - -def decode(rleObjs): - if type(rleObjs) == list: - return _mask.decode(rleObjs) - else: - return _mask.decode([rleObjs])[:,:,0] - -def area(rleObjs): - if type(rleObjs) == list: - return _mask.area(rleObjs) - else: - return _mask.area([rleObjs])[0] - -def toBbox(rleObjs): - if type(rleObjs) == list: - return _mask.toBbox(rleObjs) - else: - return _mask.toBbox([rleObjs])[0] diff --git a/example/rcnn/rcnn/pycocotools/maskApi.c b/example/rcnn/rcnn/pycocotools/maskApi.c deleted file mode 100644 index 9dd660de1252..000000000000 --- a/example/rcnn/rcnn/pycocotools/maskApi.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/************************************************************************** -* Microsoft COCO Toolbox. version 2.0 -* Data, paper, and tutorials available at: http://mscoco.org/ -* Code written by Piotr Dollar and Tsung-Yi Lin, 2015. -* Licensed under the Simplified BSD License [see coco/license.txt] -**************************************************************************/ -#include "maskApi.h" -#include -#include - -uint umin( uint a, uint b ) { return (ab) ? a : b; } - -void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ) { - R->h=h; R->w=w; R->m=m; R->cnts=(m==0)?0:malloc(sizeof(uint)*m); - siz j; if(cnts) for(j=0; jcnts[j]=cnts[j]; -} - -void rleFree( RLE *R ) { - free(R->cnts); R->cnts=0; -} - -void rlesInit( RLE **R, siz n ) { - siz i; *R = (RLE*) malloc(sizeof(RLE)*n); - for(i=0; i0 ) { - c=umin(ca,cb); cc+=c; ct=0; - ca-=c; if(!ca && a0) { - crowd=iscrowd!=NULL && iscrowd[g]; - if(dt[d].h!=gt[g].h || dt[d].w!=gt[g].w) { o[g*m+d]=-1; continue; } - siz ka, kb, a, b; uint c, ca, cb, ct, i, u; int va, vb; - ca=dt[d].cnts[0]; ka=dt[d].m; va=vb=0; - cb=gt[g].cnts[0]; kb=gt[g].m; a=b=1; i=u=0; ct=1; - while( ct>0 ) { - c=umin(ca,cb); if(va||vb) { u+=c; if(va&&vb) i+=c; } ct=0; - ca-=c; if(!ca && athr) keep[j]=0; - } - } -} - -void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ) { - double h, w, i, u, ga, da; siz g, d; int crowd; - for( g=0; gthr) keep[j]=0; - } - } -} - -void rleToBbox( const RLE *R, BB bb, siz n ) { - siz i; for( i=0; id?1:c=dy && xs>xe) || (dxye); - if(flip) { t=xs; xs=xe; xe=t; t=ys; ys=ye; ye=t; } - s = dx>=dy ? (double)(ye-ys)/dx : (double)(xe-xs)/dy; - if(dx>=dy) for( d=0; d<=dx; d++ ) { - t=flip?dx-d:d; u[m]=t+xs; v[m]=(int)(ys+s*t+.5); m++; - } else for( d=0; d<=dy; d++ ) { - t=flip?dy-d:d; v[m]=t+ys; u[m]=(int)(xs+s*t+.5); m++; - } - } - /* get points along y-boundary and downsample */ - free(x); free(y); k=m; m=0; double xd, yd; - x=malloc(sizeof(int)*k); y=malloc(sizeof(int)*k); - for( j=1; jw-1 ) continue; - yd=(double)(v[j]h) yd=h; yd=ceil(yd); - x[m]=(int) xd; y[m]=(int) yd; m++; - } - /* compute rle encoding given y-boundary points */ - k=m; a=malloc(sizeof(uint)*(k+1)); - for( j=0; j0) b[m++]=a[j++]; else { - j++; if(jm, p=0; long x; int more; - char *s=malloc(sizeof(char)*m*6); - for( i=0; icnts[i]; if(i>2) x-=(long) R->cnts[i-2]; more=1; - while( more ) { - char c=x & 0x1f; x >>= 5; more=(c & 0x10) ? 
x!=-1 : x!=0; - if(more) c |= 0x20; c+=48; s[p++]=c; - } - } - s[p]=0; return s; -} - -void rleFrString( RLE *R, char *s, siz h, siz w ) { - siz m=0, p=0, k; long x; int more; uint *cnts; - while( s[m] ) m++; cnts=malloc(sizeof(uint)*m); m=0; - while( s[p] ) { - x=0; k=0; more=1; - while( more ) { - char c=s[p]-48; x |= (c & 0x1f) << 5*k; - more = c & 0x20; p++; k++; - if(!more && (c & 0x10)) x |= -1 << 5*k; - } - if(m>2) x+=(long) cnts[m-2]; cnts[m++]=(uint) x; - } - rleInit(R,h,w,m,cnts); free(cnts); -} diff --git a/example/rcnn/rcnn/pycocotools/maskApi.h b/example/rcnn/rcnn/pycocotools/maskApi.h deleted file mode 100644 index 56b4c0c4c704..000000000000 --- a/example/rcnn/rcnn/pycocotools/maskApi.h +++ /dev/null @@ -1,79 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/************************************************************************** -* Microsoft COCO Toolbox. version 2.0 -* Data, paper, and tutorials available at: http://mscoco.org/ -* Code written by Piotr Dollar and Tsung-Yi Lin, 2015. -* Licensed under the Simplified BSD License [see coco/license.txt] -**************************************************************************/ -#pragma once - -typedef unsigned int uint; -typedef unsigned long siz; -typedef unsigned char byte; -typedef double* BB; -typedef struct { siz h, w, m; uint *cnts; } RLE; - -/* Initialize/destroy RLE. */ -void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); -void rleFree( RLE *R ); - -/* Initialize/destroy RLE array. */ -void rlesInit( RLE **R, siz n ); -void rlesFree( RLE **R, siz n ); - -/* Encode binary masks using RLE. */ -void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); - -/* Decode binary masks encoded via RLE. */ -void rleDecode( const RLE *R, byte *mask, siz n ); - -/* Compute union or intersection of encoded masks. */ -void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); - -/* Compute area of encoded masks. */ -void rleArea( const RLE *R, siz n, uint *a ); - -/* Compute intersection over union between masks. */ -void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); - -/* Compute non-maximum suppression between bounding masks */ -void rleNms( RLE *dt, siz n, uint *keep, double thr ); - -/* Compute intersection over union between bounding boxes. */ -void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); - -/* Compute non-maximum suppression between bounding boxes */ -void bbNms( BB dt, siz n, uint *keep, double thr ); - -/* Get bounding boxes surrounding encoded masks. */ -void rleToBbox( const RLE *R, BB bb, siz n ); - -/* Convert bounding boxes to encoded masks. */ -void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); - -/* Convert polygon to encoded mask. 
*/ -void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); - -/* Get compressed string representation of encoded mask. */ -char* rleToString( const RLE *R ); - -/* Convert from compressed string representation of encoded mask. */ -void rleFrString( RLE *R, char *s, siz h, siz w ); diff --git a/example/rcnn/rcnn/pycocotools/setup.py b/example/rcnn/rcnn/pycocotools/setup.py deleted file mode 100644 index d7074e910ee5..000000000000 --- a/example/rcnn/rcnn/pycocotools/setup.py +++ /dev/null @@ -1,37 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from distutils.core import setup -from Cython.Build import cythonize -from distutils.extension import Extension -import numpy as np - -# To compile and install locally run "python setup.py build_ext --inplace" -# To install library to Python site-packages run "python setup.py build_ext install" - -ext_modules = [ - Extension( - '_mask', - sources=['maskApi.c', '_mask.pyx'], - include_dirs=[np.get_include()], - extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], - ) -] - -setup(name='pycocotools', - ext_modules=cythonize(ext_modules) -) diff --git a/example/rcnn/rcnn/symbol/__init__.py b/example/rcnn/rcnn/symbol/__init__.py deleted file mode 100644 index 7547122dd5f9..000000000000 --- a/example/rcnn/rcnn/symbol/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from .symbol_vgg import * -from .symbol_resnet import * diff --git a/example/rcnn/rcnn/symbol/proposal.py b/example/rcnn/rcnn/symbol/proposal.py deleted file mode 100644 index 64981513980b..000000000000 --- a/example/rcnn/rcnn/symbol/proposal.py +++ /dev/null @@ -1,241 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Proposal Operator transform anchor coordinates into ROI coordinates with prediction results on -classification probability and bounding box prediction results, and image size and scale information. -""" - -import mxnet as mx -import numpy as np -import numpy.random as npr -from distutils.util import strtobool - -from rcnn.logger import logger -from rcnn.processing.bbox_transform import bbox_pred, clip_boxes -from rcnn.processing.generate_anchor import generate_anchors -from rcnn.processing.nms import py_nms_wrapper, cpu_nms_wrapper, gpu_nms_wrapper - - -class ProposalOperator(mx.operator.CustomOp): - def __init__(self, feat_stride, scales, ratios, output_score, - rpn_pre_nms_top_n, rpn_post_nms_top_n, threshold, rpn_min_size): - super(ProposalOperator, self).__init__() - self._feat_stride = feat_stride - self._scales = np.fromstring(scales[1:-1], dtype=float, sep=',') - self._ratios = np.fromstring(ratios[1:-1], dtype=float, sep=',') - self._anchors = generate_anchors(base_size=self._feat_stride, scales=self._scales, ratios=self._ratios) - self._num_anchors = self._anchors.shape[0] - self._output_score = output_score - self._rpn_pre_nms_top_n = rpn_pre_nms_top_n - self._rpn_post_nms_top_n = rpn_post_nms_top_n - self._threshold = threshold - self._rpn_min_size = rpn_min_size - - logger.debug('feat_stride: %s' % self._feat_stride) - logger.debug('anchors:\n%s' % self._anchors) - - def forward(self, is_train, req, in_data, out_data, aux): - nms = gpu_nms_wrapper(self._threshold, in_data[0].context.device_id) - - batch_size = in_data[0].shape[0] - if batch_size > 1: - raise ValueError("Sorry, multiple images each device is not implemented") - - # for each (H, W) location i - # generate A anchor boxes centered on cell i - # apply predicted bbox deltas at cell i to each of the A anchors - # clip predicted boxes to image - # remove predicted boxes with either height or width < threshold - # sort all (proposal, score) pairs by score from highest to lowest - # take top pre_nms_topN proposals before NMS - # apply NMS with threshold 0.7 to remaining proposals - # take after_nms_topN proposals after NMS - # return the top proposals (-> RoIs top, scores top) - - pre_nms_topN = self._rpn_pre_nms_top_n - post_nms_topN = self._rpn_post_nms_top_n - min_size = self._rpn_min_size - - # the first set of anchors are background probabilities - # keep the second part - scores = in_data[0].asnumpy()[:, self._num_anchors:, :, :] - bbox_deltas = in_data[1].asnumpy() - im_info = in_data[2].asnumpy()[0, :] - - logger.debug('im_info: %s' % im_info) - - # 1. 
Generate proposals from bbox_deltas and shifted anchors - # use real image size instead of padded feature map sizes - height, width = int(im_info[0] / self._feat_stride), int(im_info[1] / self._feat_stride) - - logger.debug('score map size: (%d, %d)' % (scores.shape[2], scores.shape[3])) - logger.debug('resudial: (%d, %d)' % (scores.shape[2] - height, scores.shape[3] - width)) - - # Enumerate all shifts - shift_x = np.arange(0, width) * self._feat_stride - shift_y = np.arange(0, height) * self._feat_stride - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() - - # Enumerate all shifted anchors: - # - # add A anchors (1, A, 4) to - # cell K shifts (K, 1, 4) to get - # shift anchors (K, A, 4) - # reshape to (K*A, 4) shifted anchors - A = self._num_anchors - K = shifts.shape[0] - anchors = self._anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) - anchors = anchors.reshape((K * A, 4)) - - # Transpose and reshape predicted bbox transformations to get them - # into the same order as the anchors: - # - # bbox deltas will be (1, 4 * A, H, W) format - # transpose to (1, H, W, 4 * A) - # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) - # in slowest to fastest order - bbox_deltas = self._clip_pad(bbox_deltas, (height, width)) - bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4)) - - # Same story for the scores: - # - # scores are (1, A, H, W) format - # transpose to (1, H, W, A) - # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) - scores = self._clip_pad(scores, (height, width)) - scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1)) - - # Convert anchors into proposals via bbox transformations - proposals = bbox_pred(anchors, bbox_deltas) - - # 2. clip predicted boxes to image - proposals = clip_boxes(proposals, im_info[:2]) - - # 3. remove predicted boxes with either height or width < threshold - # (NOTE: convert min_size to input image scale stored in im_info[2]) - keep = self._filter_boxes(proposals, min_size * im_info[2]) - proposals = proposals[keep, :] - scores = scores[keep] - - # 4. sort all (proposal, score) pairs by score from highest to lowest - # 5. take top pre_nms_topN (e.g. 6000) - order = scores.ravel().argsort()[::-1] - if pre_nms_topN > 0: - order = order[:pre_nms_topN] - proposals = proposals[order, :] - scores = scores[order] - - # 6. apply nms (e.g. threshold = 0.7) - # 7. take after_nms_topN (e.g. 300) - # 8. 
return the top proposals (-> RoIs top) - det = np.hstack((proposals, scores)).astype(np.float32) - keep = nms(det) - if post_nms_topN > 0: - keep = keep[:post_nms_topN] - # pad to ensure output size remains unchanged - if len(keep) < post_nms_topN: - pad = npr.choice(keep, size=post_nms_topN - len(keep)) - keep = np.hstack((keep, pad)) - proposals = proposals[keep, :] - scores = scores[keep] - - # Output rois array - # Our RPN implementation only supports a single input image, so all - # batch inds are 0 - batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) - blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) - self.assign(out_data[0], req[0], blob) - - if self._output_score: - self.assign(out_data[1], req[1], scores.astype(np.float32, copy=False)) - - def backward(self, req, out_grad, in_data, out_data, in_grad, aux): - self.assign(in_grad[0], req[0], 0) - self.assign(in_grad[1], req[1], 0) - self.assign(in_grad[2], req[2], 0) - - @staticmethod - def _filter_boxes(boxes, min_size): - """ Remove all boxes with any side smaller than min_size """ - ws = boxes[:, 2] - boxes[:, 0] + 1 - hs = boxes[:, 3] - boxes[:, 1] + 1 - keep = np.where((ws >= min_size) & (hs >= min_size))[0] - return keep - - @staticmethod - def _clip_pad(tensor, pad_shape): - """ - Clip boxes of the pad area. - :param tensor: [n, c, H, W] - :param pad_shape: [h, w] - :return: [n, c, h, w] - """ - H, W = tensor.shape[2:] - h, w = pad_shape - - if h < H or w < W: - tensor = tensor[:, :, :h, :w].copy() - - return tensor - - -@mx.operator.register("proposal") -class ProposalProp(mx.operator.CustomOpProp): - def __init__(self, feat_stride='16', scales='(8, 16, 32)', ratios='(0.5, 1, 2)', output_score='False', - rpn_pre_nms_top_n='6000', rpn_post_nms_top_n='300', threshold='0.3', rpn_min_size='16'): - super(ProposalProp, self).__init__(need_top_grad=False) - self._feat_stride = int(feat_stride) - self._scales = scales - self._ratios = ratios - self._output_score = strtobool(output_score) - self._rpn_pre_nms_top_n = int(rpn_pre_nms_top_n) - self._rpn_post_nms_top_n = int(rpn_post_nms_top_n) - self._threshold = float(threshold) - self._rpn_min_size = int(rpn_min_size) - - def list_arguments(self): - return ['cls_prob', 'bbox_pred', 'im_info'] - - def list_outputs(self): - if self._output_score: - return ['output', 'score'] - else: - return ['output'] - - def infer_shape(self, in_shape): - cls_prob_shape = in_shape[0] - bbox_pred_shape = in_shape[1] - assert cls_prob_shape[0] == bbox_pred_shape[0], 'ROI number does not equal in cls and reg' - - batch_size = cls_prob_shape[0] - im_info_shape = (batch_size, 3) - output_shape = (self._rpn_post_nms_top_n, 5) - score_shape = (self._rpn_post_nms_top_n, 1) - - if self._output_score: - return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape, score_shape] - else: - return [cls_prob_shape, bbox_pred_shape, im_info_shape], [output_shape] - - def create_operator(self, ctx, shapes, dtypes): - return ProposalOperator(self._feat_stride, self._scales, self._ratios, self._output_score, - self._rpn_pre_nms_top_n, self._rpn_post_nms_top_n, self._threshold, self._rpn_min_size) - - def declare_backward_dependency(self, out_grad, in_data, out_data): - return [] diff --git a/example/rcnn/rcnn/symbol/proposal_target.py b/example/rcnn/rcnn/symbol/proposal_target.py deleted file mode 100644 index 0af19a9cf332..000000000000 --- a/example/rcnn/rcnn/symbol/proposal_target.py +++ /dev/null @@ -1,113 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) 
under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" -Proposal Target Operator selects foreground and background roi and assigns label, bbox_transform to them. -""" - -import logging -import mxnet as mx -import numpy as np -from distutils.util import strtobool - -from ..logger import logger -from rcnn.io.rcnn import sample_rois - - -class ProposalTargetOperator(mx.operator.CustomOp): - def __init__(self, num_classes, batch_images, batch_rois, fg_fraction): - super(ProposalTargetOperator, self).__init__() - self._num_classes = num_classes - self._batch_images = batch_images - self._batch_rois = batch_rois - self._fg_fraction = fg_fraction - - if logger.level == logging.DEBUG: - self._count = 0 - self._fg_num = 0 - self._bg_num = 0 - - def forward(self, is_train, req, in_data, out_data, aux): - assert self._batch_rois % self._batch_images == 0, \ - 'BATCHIMAGES {} must devide BATCH_ROIS {}'.format(self._batch_images, self._batch_rois) - rois_per_image = self._batch_rois / self._batch_images - fg_rois_per_image = np.round(self._fg_fraction * rois_per_image).astype(np.int) - - all_rois = in_data[0].asnumpy() - gt_boxes = in_data[1].asnumpy() - - # Include ground-truth boxes in the set of candidate rois - zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) - all_rois = np.vstack((all_rois, np.hstack((zeros, gt_boxes[:, :-1])))) - # Sanity check: single batch only - assert np.all(all_rois[:, 0] == 0), 'Only single item batches are supported' - - rois, labels, bbox_targets, bbox_weights = \ - sample_rois(all_rois, fg_rois_per_image, rois_per_image, self._num_classes, gt_boxes=gt_boxes) - - if logger.level == logging.DEBUG: - logger.debug("labels: %s" % labels) - logger.debug('num fg: {}'.format((labels > 0).sum())) - logger.debug('num bg: {}'.format((labels == 0).sum())) - self._count += 1 - self._fg_num += (labels > 0).sum() - self._bg_num += (labels == 0).sum() - logger.debug("self._count: %d" % self._count) - logger.debug('num fg avg: %d' % (self._fg_num / self._count)) - logger.debug('num bg avg: %d' % (self._bg_num / self._count)) - logger.debug('ratio: %.3f' % (float(self._fg_num) / float(self._bg_num))) - - for ind, val in enumerate([rois, labels, bbox_targets, bbox_weights]): - self.assign(out_data[ind], req[ind], val) - - def backward(self, req, out_grad, in_data, out_data, in_grad, aux): - self.assign(in_grad[0], req[0], 0) - self.assign(in_grad[1], req[1], 0) - - -@mx.operator.register('proposal_target') -class ProposalTargetProp(mx.operator.CustomOpProp): - def __init__(self, num_classes, batch_images, batch_rois, fg_fraction='0.25'): - super(ProposalTargetProp, self).__init__(need_top_grad=False) - self._num_classes = int(num_classes) - self._batch_images = int(batch_images) - self._batch_rois = int(batch_rois) - self._fg_fraction = float(fg_fraction) - - def list_arguments(self): - return 
['rois', 'gt_boxes'] - - def list_outputs(self): - return ['rois_output', 'label', 'bbox_target', 'bbox_weight'] - - def infer_shape(self, in_shape): - rpn_rois_shape = in_shape[0] - gt_boxes_shape = in_shape[1] - - output_rois_shape = (self._batch_rois, 5) - label_shape = (self._batch_rois, ) - bbox_target_shape = (self._batch_rois, self._num_classes * 4) - bbox_weight_shape = (self._batch_rois, self._num_classes * 4) - - return [rpn_rois_shape, gt_boxes_shape], \ - [output_rois_shape, label_shape, bbox_target_shape, bbox_weight_shape] - - def create_operator(self, ctx, shapes, dtypes): - return ProposalTargetOperator(self._num_classes, self._batch_images, self._batch_rois, self._fg_fraction) - - def declare_backward_dependency(self, out_grad, in_data, out_data): - return [] diff --git a/example/rcnn/rcnn/symbol/symbol_vgg.py b/example/rcnn/rcnn/symbol/symbol_vgg.py deleted file mode 100644 index 33fbede2df1d..000000000000 --- a/example/rcnn/rcnn/symbol/symbol_vgg.py +++ /dev/null @@ -1,424 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx -from rcnn.config import config -from . import proposal -from . 
import proposal_target - -def get_vgg_conv(data): - """ - shared convolutional layers - :param data: Symbol - :return: Symbol - """ - # group 1 - conv1_1 = mx.symbol.Convolution( - data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_1") - relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") - conv1_2 = mx.symbol.Convolution( - data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_2") - relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") - pool1 = mx.symbol.Pooling( - data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") - # group 2 - conv2_1 = mx.symbol.Convolution( - data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_1") - relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") - conv2_2 = mx.symbol.Convolution( - data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_2") - relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") - pool2 = mx.symbol.Pooling( - data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") - # group 3 - conv3_1 = mx.symbol.Convolution( - data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_1") - relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") - conv3_2 = mx.symbol.Convolution( - data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_2") - relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") - conv3_3 = mx.symbol.Convolution( - data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_3") - relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") - pool3 = mx.symbol.Pooling( - data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool3") - # group 4 - conv4_1 = mx.symbol.Convolution( - data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_1") - relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") - conv4_2 = mx.symbol.Convolution( - data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_2") - relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") - conv4_3 = mx.symbol.Convolution( - data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_3") - relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") - pool4 = mx.symbol.Pooling( - data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") - # group 5 - conv5_1 = mx.symbol.Convolution( - data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_1") - relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") - conv5_2 = mx.symbol.Convolution( - data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_2") - relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") - conv5_3 = mx.symbol.Convolution( - data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_3") - relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") - - return relu5_3 - - -def get_vgg_rcnn(num_classes=config.NUM_CLASSES): - """ - Fast R-CNN with VGG 16 conv layers - :param num_classes: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - rois = 
mx.symbol.Variable(name='rois') - label = mx.symbol.Variable(name='label') - bbox_target = mx.symbol.Variable(name='bbox_target') - bbox_weight = mx.symbol.Variable(name='bbox_weight') - - # reshape input - rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape') - label = mx.symbol.Reshape(data=label, shape=(-1, ), name='label_reshape') - bbox_target = mx.symbol.Reshape(data=bbox_target, shape=(-1, 4 * num_classes), name='bbox_target_reshape') - bbox_weight = mx.symbol.Reshape(data=bbox_weight, shape=(-1, 4 * num_classes), name='bbox_weight_reshape') - - # shared convolutional layers - relu5_3 = get_vgg_conv(data) - - # Fast R-CNN - pool5 = mx.symbol.ROIPooling( - name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) - # group 6 - flatten = mx.symbol.Flatten(data=pool5, name="flatten") - fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") - relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") - drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") - relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") - drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) - cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label, normalization='batch') - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) - bbox_loss_ = bbox_weight * mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, data=(bbox_pred - bbox_target)) - bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / config.TRAIN.BATCH_ROIS) - - # reshape output - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_loss_reshape') - - # group output - group = mx.symbol.Group([cls_prob, bbox_loss]) - return group - - -def get_vgg_rcnn_test(num_classes=config.NUM_CLASSES): - """ - Fast R-CNN Network with VGG - :param num_classes: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - rois = mx.symbol.Variable(name='rois') - - # reshape rois - rois = mx.symbol.Reshape(data=rois, shape=(-1, 5), name='rois_reshape') - - # shared convolutional layer - relu5_3 = get_vgg_conv(data) - - # Fast R-CNN - pool5 = mx.symbol.ROIPooling( - name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) - # group 6 - flatten = mx.symbol.Flatten(data=pool5, name="flatten") - fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") - relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") - drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") - relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") - drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) - cls_prob = mx.symbol.softmax(name='cls_prob', data=cls_score) - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, 
num_hidden=num_classes * 4) - - # reshape output - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape') - - # group output - group = mx.symbol.Group([cls_prob, bbox_pred]) - return group - - -def get_vgg_rpn(num_anchors=config.NUM_ANCHORS): - """ - Region Proposal Network with VGG - :param num_anchors: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - label = mx.symbol.Variable(name='label') - bbox_target = mx.symbol.Variable(name='bbox_target') - bbox_weight = mx.symbol.Variable(name='bbox_weight') - - # shared convolutional layers - relu5_3 = get_vgg_conv(data) - - # RPN - rpn_conv = mx.symbol.Convolution( - data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") - rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") - rpn_cls_score = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") - rpn_bbox_pred = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - - # prepare rpn data - rpn_cls_score_reshape = mx.symbol.Reshape( - data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - - # classification - cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=label, multi_output=True, - normalization='valid', use_ignore=True, ignore_label=-1, name="cls_prob") - # bounding box regression - bbox_loss_ = bbox_weight * mx.symbol.smooth_l1(name='bbox_loss_', scalar=3.0, data=(rpn_bbox_pred - bbox_target)) - bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / config.TRAIN.RPN_BATCH_SIZE) - # group output - group = mx.symbol.Group([cls_prob, bbox_loss]) - return group - - -def get_vgg_rpn_test(num_anchors=config.NUM_ANCHORS): - """ - Region Proposal Network with VGG - :param num_anchors: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - im_info = mx.symbol.Variable(name="im_info") - - # shared convolutional layers - relu5_3 = get_vgg_conv(data) - - # RPN - rpn_conv = mx.symbol.Convolution( - data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") - rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") - rpn_cls_score = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") - rpn_bbox_pred = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - - # ROI Proposal - rpn_cls_score_reshape = mx.symbol.Reshape( - data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - rpn_cls_prob = mx.symbol.SoftmaxActivation( - data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") - rpn_cls_prob_reshape = mx.symbol.Reshape( - data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') - if config.TEST.CXX_PROPOSAL: - group = mx.symbol.contrib.Proposal( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', output_score=True, - feature_stride=config.RPN_FEAT_STRIDE, scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.PROPOSAL_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.PROPOSAL_POST_NMS_TOP_N, - 
threshold=config.TEST.PROPOSAL_NMS_THRESH, rpn_min_size=config.TEST.PROPOSAL_MIN_SIZE) - else: - group = mx.symbol.Custom( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', output_score=True, - op_type='proposal', feat_stride=config.RPN_FEAT_STRIDE, - scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.PROPOSAL_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.PROPOSAL_POST_NMS_TOP_N, - threshold=config.TEST.PROPOSAL_NMS_THRESH, rpn_min_size=config.TEST.PROPOSAL_MIN_SIZE) - # rois = group[0] - # score = group[1] - - return group - - -def get_vgg_test(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS): - """ - Faster R-CNN test with VGG 16 conv layers - :param num_classes: used to determine output size - :param num_anchors: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - im_info = mx.symbol.Variable(name="im_info") - - # shared convolutional layers - relu5_3 = get_vgg_conv(data) - - # RPN - rpn_conv = mx.symbol.Convolution( - data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") - rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") - rpn_cls_score = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") - rpn_bbox_pred = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - - # ROI Proposal - rpn_cls_score_reshape = mx.symbol.Reshape( - data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - rpn_cls_prob = mx.symbol.SoftmaxActivation( - data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") - rpn_cls_prob_reshape = mx.symbol.Reshape( - data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') - if config.TEST.CXX_PROPOSAL: - rois = mx.symbol.contrib.Proposal( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - feature_stride=config.RPN_FEAT_STRIDE, scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.RPN_POST_NMS_TOP_N, - threshold=config.TEST.RPN_NMS_THRESH, rpn_min_size=config.TEST.RPN_MIN_SIZE) - else: - rois = mx.symbol.Custom( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - op_type='proposal', feat_stride=config.RPN_FEAT_STRIDE, - scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.RPN_POST_NMS_TOP_N, - threshold=config.TEST.RPN_NMS_THRESH, rpn_min_size=config.TEST.RPN_MIN_SIZE) - - # Fast R-CNN - pool5 = mx.symbol.ROIPooling( - name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) - # group 6 - flatten = mx.symbol.Flatten(data=pool5, name="flatten") - fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") - relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") - drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") - relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") - drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) - 
cls_prob = mx.symbol.softmax(name='cls_prob', data=cls_score) - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) - - # reshape output - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape') - - # group output - group = mx.symbol.Group([rois, cls_prob, bbox_pred]) - return group - - -def get_vgg_train(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS): - """ - Faster R-CNN end-to-end with VGG 16 conv layers - :param num_classes: used to determine output size - :param num_anchors: used to determine output size - :return: Symbol - """ - data = mx.symbol.Variable(name="data") - im_info = mx.symbol.Variable(name="im_info") - gt_boxes = mx.symbol.Variable(name="gt_boxes") - rpn_label = mx.symbol.Variable(name='label') - rpn_bbox_target = mx.symbol.Variable(name='bbox_target') - rpn_bbox_weight = mx.symbol.Variable(name='bbox_weight') - - # shared convolutional layers - relu5_3 = get_vgg_conv(data) - - # RPN layers - rpn_conv = mx.symbol.Convolution( - data=relu5_3, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") - rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") - rpn_cls_score = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") - rpn_bbox_pred = mx.symbol.Convolution( - data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - - # prepare rpn data - rpn_cls_score_reshape = mx.symbol.Reshape( - data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - - # classification - rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=rpn_label, multi_output=True, - normalization='valid', use_ignore=True, ignore_label=-1, name="rpn_cls_prob") - # bounding box regression - rpn_bbox_loss_ = rpn_bbox_weight * mx.symbol.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, data=(rpn_bbox_pred - rpn_bbox_target)) - rpn_bbox_loss = mx.sym.MakeLoss(name='rpn_bbox_loss', data=rpn_bbox_loss_, grad_scale=1.0 / config.TRAIN.RPN_BATCH_SIZE) - - # ROI proposal - rpn_cls_act = mx.symbol.SoftmaxActivation( - data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_act") - rpn_cls_act_reshape = mx.symbol.Reshape( - data=rpn_cls_act, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_act_reshape') - if config.TRAIN.CXX_PROPOSAL: - rois = mx.symbol.contrib.Proposal( - cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - feature_stride=config.RPN_FEAT_STRIDE, scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TRAIN.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TRAIN.RPN_POST_NMS_TOP_N, - threshold=config.TRAIN.RPN_NMS_THRESH, rpn_min_size=config.TRAIN.RPN_MIN_SIZE) - else: - rois = mx.symbol.Custom( - cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - op_type='proposal', feat_stride=config.RPN_FEAT_STRIDE, - scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TRAIN.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TRAIN.RPN_POST_NMS_TOP_N, - threshold=config.TRAIN.RPN_NMS_THRESH, rpn_min_size=config.TRAIN.RPN_MIN_SIZE) - - # ROI proposal target - gt_boxes_reshape = mx.symbol.Reshape(data=gt_boxes, shape=(-1, 5), 
name='gt_boxes_reshape') - group = mx.symbol.Custom(rois=rois, gt_boxes=gt_boxes_reshape, op_type='proposal_target', - num_classes=num_classes, batch_images=config.TRAIN.BATCH_IMAGES, - batch_rois=config.TRAIN.BATCH_ROIS, fg_fraction=config.TRAIN.FG_FRACTION) - rois = group[0] - label = group[1] - bbox_target = group[2] - bbox_weight = group[3] - - # Fast R-CNN - pool5 = mx.symbol.ROIPooling( - name='roi_pool5', data=relu5_3, rois=rois, pooled_size=(7, 7), spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) - # group 6 - flatten = mx.symbol.Flatten(data=pool5, name="flatten") - fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") - relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") - drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") - # group 7 - fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") - relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") - drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=drop7, num_hidden=num_classes) - cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label, normalization='batch') - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=drop7, num_hidden=num_classes * 4) - bbox_loss_ = bbox_weight * mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, data=(bbox_pred - bbox_target)) - bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / config.TRAIN.BATCH_ROIS) - - # reshape output - label = mx.symbol.Reshape(data=label, shape=(config.TRAIN.BATCH_IMAGES, -1), name='label_reshape') - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_loss_reshape') - - group = mx.symbol.Group([rpn_cls_prob, rpn_bbox_loss, cls_prob, bbox_loss, mx.symbol.BlockGrad(label)]) - return group diff --git a/example/rcnn/rcnn/tools/__init__.py b/example/rcnn/rcnn/tools/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/example/rcnn/rcnn/tools/reeval.py b/example/rcnn/rcnn/tools/reeval.py deleted file mode 100644 index 1e5c0aa5a840..000000000000 --- a/example/rcnn/rcnn/tools/reeval.py +++ /dev/null @@ -1,67 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
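A note on the loss construction that recurs throughout the symbol file above: both the RPN and the Fast R-CNN heads express box regression as an elementwise smooth-L1 over (pred - target), masked by `bbox_weight` so that only the matched class of a foreground ROI contributes, and normalized through `MakeLoss`'s `grad_scale`. A minimal numpy sketch of that composition (shapes and values are illustrative, not taken from the code):

```python
import numpy as np

def smooth_l1(x, scalar=1.0):
    # elementwise smooth-L1 as in mx.symbol.smooth_l1:
    # 0.5*(scalar*x)^2 where |x| < 1/scalar^2, |x| - 0.5/scalar^2 elsewhere
    abs_x = np.abs(x)
    return np.where(abs_x < 1.0 / scalar ** 2,
                    0.5 * (scalar * x) ** 2,
                    abs_x - 0.5 / scalar ** 2)

num_rois, num_classes = 4, 2                      # hypothetical toy sizes
bbox_pred = np.random.randn(num_rois, 4 * num_classes)
bbox_target = np.zeros((num_rois, 4 * num_classes))
bbox_weight = np.zeros((num_rois, 4 * num_classes))
bbox_weight[0, 4:8] = 1.0                         # roi 0 matched to class 1

loss = bbox_weight * smooth_l1(bbox_pred - bbox_target, scalar=1.0)
print(loss.sum() / num_rois)                      # grad_scale = 1/BATCH_ROIS plays this role
```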
-
-import argparse
-try:
-    import cPickle as pickle
-except ImportError:
-    import pickle
-import os
-import mxnet as mx
-
-from ..logger import logger
-from ..config import config, default, generate_config
-from ..dataset import *
-
-
-def reeval(args):
-    # load imdb
-    imdb = eval(args.dataset)(args.image_set, args.root_path, args.dataset_path)
-
-    # load detection results (pickle files must be opened in binary mode)
-    cache_file = os.path.join(imdb.cache_path, imdb.name, 'detections.pkl')
-    with open(cache_file, 'rb') as f:
-        detections = pickle.load(f)
-
-    # eval
-    imdb.evaluate_detections(detections)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='imdb test')
-    # general
-    parser.add_argument('--network', help='network name', default=default.network, type=str)
-    parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str)
-    args, rest = parser.parse_known_args()
-    generate_config(args.network, args.dataset)
-    parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str)
-    parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str)
-    parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str)
-    # other
-    parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true')
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-    logger.info('Called with argument: %s' % args)
-    reeval(args)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/example/rcnn/rcnn/tools/test_rcnn.py b/example/rcnn/rcnn/tools/test_rcnn.py
deleted file mode 100644
index 2c5c22223f14..000000000000
--- a/example/rcnn/rcnn/tools/test_rcnn.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import pprint
-import mxnet as mx
-
-from ..logger import logger
-from ..config import config, default, generate_config
-from ..symbol import *
-from ..dataset import *
-from ..core.loader import TestLoader
-from ..core.tester import Predictor, pred_eval
-from ..utils.load_model import load_param
-
-
-def test_rcnn(network, dataset, image_set, root_path, dataset_path,
-              ctx, prefix, epoch,
-              vis, shuffle, has_rpn, proposal, thresh):
-    # set config
-    if has_rpn:
-        config.TEST.HAS_RPN = True
-
-    # print config
-    pprint.pprint(config)
-
-    # load symbol and testing data
-    if has_rpn:
-        sym = eval('get_' + network + '_test')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
-        imdb = eval(dataset)(image_set, root_path, dataset_path)
-        roidb = imdb.gt_roidb()
-    else:
-        sym = eval('get_' + network + '_rcnn_test')(num_classes=config.NUM_CLASSES)
-        imdb = eval(dataset)(image_set, root_path, dataset_path)
-        gt_roidb = imdb.gt_roidb()
-        roidb = eval('imdb.'
+ proposal + '_roidb')(gt_roidb) - - # get test data iter - test_data = TestLoader(roidb, batch_size=1, shuffle=shuffle, has_rpn=has_rpn) - - # load model - arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx, process=True) - - # infer shape - data_shape_dict = dict(test_data.provide_data) - arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) - arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) - aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - - # check parameters - for k in sym.list_arguments(): - if k in data_shape_dict or 'label' in k: - continue - assert k in arg_params, k + ' not initialized' - assert arg_params[k].shape == arg_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) - for k in sym.list_auxiliary_states(): - assert k in aux_params, k + ' not initialized' - assert aux_params[k].shape == aux_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) - - # decide maximum shape - data_names = [k[0] for k in test_data.provide_data] - label_names = None - max_data_shape = [('data', (1, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] - if not has_rpn: - max_data_shape.append(('rois', (1, config.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5))) - - # create predictor - predictor = Predictor(sym, data_names, label_names, - context=ctx, max_data_shapes=max_data_shape, - provide_data=test_data.provide_data, provide_label=test_data.provide_label, - arg_params=arg_params, aux_params=aux_params) - - # start detection - pred_eval(predictor, test_data, imdb, vis=vis, thresh=thresh) - - -def parse_args(): - parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') - # general - parser.add_argument('--network', help='network name', default=default.network, type=str) - parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) - args, rest = parser.parse_known_args() - generate_config(args.network, args.dataset) - parser.add_argument('--image_set', help='image_set name', default=default.test_image_set, type=str) - parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) - parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) - # testing - parser.add_argument('--prefix', help='model to test with', default=default.rcnn_prefix, type=str) - parser.add_argument('--epoch', help='model to test with', default=default.rcnn_epoch, type=int) - parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) - # rcnn - parser.add_argument('--vis', help='turn on visualization', action='store_true') - parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) - parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') - parser.add_argument('--has_rpn', help='generate proposals on the fly', action='store_true') - parser.add_argument('--proposal', help='can be ss for selective search or rpn', default='rpn', type=str) - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - logger.info('Called with argument: %s' % args) - ctx = mx.gpu(args.gpu) - test_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, - ctx, args.prefix, args.epoch, - args.vis, args.shuffle, args.has_rpn, args.proposal, args.thresh) - -if 
__name__ == '__main__': - main() diff --git a/example/rcnn/rcnn/tools/test_rpn.py b/example/rcnn/rcnn/tools/test_rpn.py deleted file mode 100644 index f2244a568d6a..000000000000 --- a/example/rcnn/rcnn/tools/test_rpn.py +++ /dev/null @@ -1,116 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import argparse -import pprint -import mxnet as mx - -from ..logger import logger -from ..config import config, default, generate_config -from ..symbol import * -from ..dataset import * -from ..core.loader import TestLoader -from ..core.tester import Predictor, generate_proposals -from ..utils.load_model import load_param - - -def test_rpn(network, dataset, image_set, root_path, dataset_path, - ctx, prefix, epoch, - vis, shuffle, thresh): - # rpn generate proposal config - config.TEST.HAS_RPN = True - - # print config - pprint.pprint(config) - - # load symbol - sym = eval('get_' + network + '_rpn_test')(num_anchors=config.NUM_ANCHORS) - - # load dataset and prepare imdb for training - imdb = eval(dataset)(image_set, root_path, dataset_path) - roidb = imdb.gt_roidb() - test_data = TestLoader(roidb, batch_size=1, shuffle=shuffle, has_rpn=True) - - # load model - arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx) - - # infer shape - data_shape_dict = dict(test_data.provide_data) - arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) - arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) - aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - - # check parameters - for k in sym.list_arguments(): - if k in data_shape_dict or 'label' in k: - continue - assert k in arg_params, k + ' not initialized' - assert arg_params[k].shape == arg_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) - for k in sym.list_auxiliary_states(): - assert k in aux_params, k + ' not initialized' - assert aux_params[k].shape == aux_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) - - # decide maximum shape - data_names = [k[0] for k in test_data.provide_data] - label_names = None if test_data.provide_label is None else [k[0] for k in test_data.provide_label] - max_data_shape = [('data', (1, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] - - # create predictor - predictor = Predictor(sym, data_names, label_names, - context=ctx, max_data_shapes=max_data_shape, - provide_data=test_data.provide_data, provide_label=test_data.provide_label, - arg_params=arg_params, aux_params=aux_params) - - # start testing - imdb_boxes = generate_proposals(predictor, test_data, imdb, vis=vis, thresh=thresh) - imdb.evaluate_recall(roidb, candidate_boxes=imdb_boxes) - - -def 
parse_args(): - parser = argparse.ArgumentParser(description='Test a Region Proposal Network') - # general - parser.add_argument('--network', help='network name', default=default.network, type=str) - parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) - args, rest = parser.parse_known_args() - generate_config(args.network, args.dataset) - parser.add_argument('--image_set', help='image_set name', default=default.test_image_set, type=str) - parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) - parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) - # testing - parser.add_argument('--prefix', help='model to test with', default=default.rpn_prefix, type=str) - parser.add_argument('--epoch', help='model to test with', default=default.rpn_epoch, type=int) - # rpn - parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) - parser.add_argument('--vis', help='turn on visualization', action='store_true') - parser.add_argument('--thresh', help='rpn proposal threshold', default=0, type=float) - parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - logger.info('Called with argument: %s' % args) - ctx = mx.gpu(args.gpu) - test_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, - ctx, args.prefix, args.epoch, - args.vis, args.shuffle, args.thresh) - -if __name__ == '__main__': - main() diff --git a/example/rcnn/rcnn/tools/train_rcnn.py b/example/rcnn/rcnn/tools/train_rcnn.py deleted file mode 100644 index 70ff7b7afed5..000000000000 --- a/example/rcnn/rcnn/tools/train_rcnn.py +++ /dev/null @@ -1,189 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
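train_rcnn.py below repeats the parameter validation already seen in test_rcnn.py and test_rpn.py above. Factored out, the check is small; a sketch under the assumption that `sym`, the checkpoint dicts, and `data_shape_dict` come from the same loading code as in these tools:

```python
import mxnet as mx

def check_params(sym, arg_params, aux_params, data_shape_dict):
    """Every non-data, non-label argument must exist in the checkpoint and
    match the shape inferred from the input shapes."""
    arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
    for k in sym.list_arguments():
        if k in data_shape_dict or 'label' in k:
            continue
        assert k in arg_params, '%s not initialized' % k
        assert arg_params[k].shape == arg_shape_dict[k], \
            'shape mismatch for %s: inferred %s, provided %s' % (k, arg_shape_dict[k], arg_params[k].shape)
    for k in sym.list_auxiliary_states():
        assert k in aux_params, '%s not initialized' % k
        assert aux_params[k].shape == aux_shape_dict[k], \
            'shape mismatch for %s: inferred %s, provided %s' % (k, aux_shape_dict[k], aux_params[k].shape)
```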
- -import argparse -import pprint -import mxnet as mx - -from ..logger import logger -from ..config import config, default, generate_config -from ..symbol import * -from ..core import callback, metric -from ..core.loader import ROIIter -from ..core.module import MutableModule -from ..processing.bbox_regression import add_bbox_regression_targets -from ..utils.load_data import load_proposal_roidb, merge_roidb, filter_roidb -from ..utils.load_model import load_param - - -def train_rcnn(network, dataset, image_set, root_path, dataset_path, - frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, - ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, - train_shared, lr, lr_step, proposal): - # set up config - config.TRAIN.BATCH_IMAGES = 2 - config.TRAIN.BATCH_ROIS = 128 - if proposal == 'ss': - config.TRAIN.BG_THRESH_LO = 0.1 # reproduce Fast R-CNN - - # load symbol - sym = eval('get_' + network + '_rcnn')(num_classes=config.NUM_CLASSES) - - # setup multi-gpu - batch_size = len(ctx) - input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size - - # print config - logger.info(pprint.pformat(config)) - - # load dataset and prepare imdb for training - image_sets = [iset for iset in image_set.split('+')] - roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path, - proposal=proposal, append_gt=True, flip=not no_flip) - for image_set in image_sets] - roidb = merge_roidb(roidbs) - roidb = filter_roidb(roidb) - means, stds = add_bbox_regression_targets(roidb) - - # load training data - train_data = ROIIter(roidb, batch_size=input_batch_size, shuffle=not no_shuffle, - ctx=ctx, work_load_list=work_load_list, aspect_grouping=config.TRAIN.ASPECT_GROUPING) - - # infer max shape - max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] - logger.info('providing maximum shape %s' % max_data_shape) - - # infer shape - data_shape_dict = dict(train_data.provide_data + train_data.provide_label) - arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) - arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) - out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) - aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - logger.info('output shape %s' % pprint.pformat(out_shape_dict)) - - # load and initialize params - if resume: - arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) - else: - arg_params, aux_params = load_param(pretrained, epoch, convert=True) - arg_params['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight']) - arg_params['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias']) - arg_params['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight']) - arg_params['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias']) - - # check parameter shapes - for k in sym.list_arguments(): - if k in data_shape_dict: - continue - assert k in arg_params, k + ' not initialized' - assert arg_params[k].shape == arg_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) - for k in sym.list_auxiliary_states(): - assert k in aux_params, k + ' not initialized' - assert aux_params[k].shape == aux_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) - - # prepare training - # create solver - data_names = [k[0] for k in 
train_data.provide_data] - label_names = [k[0] for k in train_data.provide_label] - if train_shared: - fixed_param_prefix = config.FIXED_PARAMS_SHARED - else: - fixed_param_prefix = config.FIXED_PARAMS - mod = MutableModule(sym, data_names=data_names, label_names=label_names, - logger=logger, context=ctx, work_load_list=work_load_list, - max_data_shapes=max_data_shape, fixed_param_prefix=fixed_param_prefix) - - # decide training params - # metric - eval_metric = metric.RCNNAccMetric() - cls_metric = metric.RCNNLogLossMetric() - bbox_metric = metric.RCNNL1LossMetric() - eval_metrics = mx.metric.CompositeEvalMetric() - for child_metric in [eval_metric, cls_metric, bbox_metric]: - eval_metrics.add(child_metric) - # callback - batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent, auto_reset=False) - epoch_end_callback = callback.do_checkpoint(prefix, means, stds) - # decide learning rate - base_lr = lr - lr_factor = 0.1 - lr_epoch = [int(epoch) for epoch in lr_step.split(',')] - lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] - lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) - lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] - logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) - lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) - # optimizer - optimizer_params = {'momentum': 0.9, - 'wd': 0.0005, - 'learning_rate': lr, - 'lr_scheduler': lr_scheduler, - 'rescale_grad': (1.0 / batch_size), - 'clip_gradient': 5} - - # train - mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, - batch_end_callback=batch_end_callback, kvstore=kvstore, - optimizer='sgd', optimizer_params=optimizer_params, - arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) - - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a Fast R-CNN Network') - # general - parser.add_argument('--network', help='network name', default=default.network, type=str) - parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) - args, rest = parser.parse_known_args() - generate_config(args.network, args.dataset) - parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) - parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) - parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) - # training - parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int) - parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str) - parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list) - parser.add_argument('--no_flip', help='disable flip images', action='store_true') - parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') - parser.add_argument('--resume', help='continue training', action='store_true') - # rcnn - parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str) - parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str) - parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int) - parser.add_argument('--prefix', help='new model prefix', 
default=default.rcnn_prefix, type=str) - parser.add_argument('--begin_epoch', help='begin epoch of training', default=0, type=int) - parser.add_argument('--end_epoch', help='end epoch of training', default=default.rcnn_epoch, type=int) - parser.add_argument('--lr', help='base learning rate', default=default.rcnn_lr, type=float) - parser.add_argument('--lr_step', help='learning rate steps (in epoch)', default=default.rcnn_lr_step, type=str) - parser.add_argument('--train_shared', help='second round train shared params', action='store_true') - parser.add_argument('--proposal', help='can be ss for selective search or rpn', default='rpn', type=str) - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - logger.info('Called with argument: %s' % args) - ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] - train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, - args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, - ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch, - train_shared=args.train_shared, lr=args.lr, lr_step=args.lr_step, proposal=args.proposal) - -if __name__ == '__main__': - main() diff --git a/example/rcnn/rcnn/tools/train_rpn.py b/example/rcnn/rcnn/tools/train_rpn.py deleted file mode 100644 index f3c1104ee376..000000000000 --- a/example/rcnn/rcnn/tools/train_rpn.py +++ /dev/null @@ -1,189 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
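Both train_rcnn above and train_rpn below translate the epoch-based `--lr_step` setting into iteration counts for `MultiFactorScheduler`, pre-decaying the base rate when resuming past a step. The arithmetic, isolated (the roidb size and rates are hypothetical):

```python
base_lr, lr_factor, begin_epoch = 0.001, 0.1, 0
num_images, batch_size = 10022, 2              # hypothetical roidb size, 2 GPUs

lr_epoch = [int(e) for e in '7'.split(',')]    # --lr_step '7'
lr_epoch_diff = [e - begin_epoch for e in lr_epoch if e > begin_epoch]
# resuming with begin_epoch past a step leaves it out of lr_epoch_diff,
# so the base rate is decayed once per already-passed step
lr = base_lr * lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))
lr_iters = [int(e * num_images / batch_size) for e in lr_epoch_diff]
print(lr, lr_iters)   # 0.001 [35077] -> mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)
```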
- -import argparse -import pprint -import mxnet as mx - -from ..logger import logger -from ..config import config, default, generate_config -from ..symbol import * -from ..core import callback, metric -from ..core.loader import AnchorLoader -from ..core.module import MutableModule -from ..utils.load_data import load_gt_roidb, merge_roidb, filter_roidb -from ..utils.load_model import load_param - - -def train_rpn(network, dataset, image_set, root_path, dataset_path, - frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, - ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, - train_shared, lr, lr_step): - # setup config - config.TRAIN.BATCH_IMAGES = 1 - - # load symbol - sym = eval('get_' + network + '_rpn')(num_anchors=config.NUM_ANCHORS) - feat_sym = sym.get_internals()['rpn_cls_score_output'] - - # setup multi-gpu - batch_size = len(ctx) - input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size - - # print config - logger.info(pprint.pformat(config)) - - # load dataset and prepare imdb for training - image_sets = [iset for iset in image_set.split('+')] - roidbs = [load_gt_roidb(dataset, image_set, root_path, dataset_path, - flip=not no_flip) - for image_set in image_sets] - roidb = merge_roidb(roidbs) - roidb = filter_roidb(roidb) - - # load training data - train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not no_shuffle, - ctx=ctx, work_load_list=work_load_list, - feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES, - anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) - - # infer max shape - max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] - max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) - logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape)) - - # infer shape - data_shape_dict = dict(train_data.provide_data + train_data.provide_label) - arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) - arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) - out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) - aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) - logger.info('output shape %s' % pprint.pformat(out_shape_dict)) - - # load and initialize params - if resume: - arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) - else: - arg_params, aux_params = load_param(pretrained, epoch, convert=True) - arg_params['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight']) - arg_params['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias']) - arg_params['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight']) - arg_params['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias']) - arg_params['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_bbox_pred_weight']) - arg_params['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias']) - - # check parameter shapes - for k in sym.list_arguments(): - if k in data_shape_dict: - continue - assert k in arg_params, k + ' not initialized' - assert arg_params[k].shape == arg_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) - for k in sym.list_auxiliary_states(): - assert k in aux_params, k + ' not initialized' - assert 
aux_params[k].shape == aux_shape_dict[k], \ - 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) - - # create solver - data_names = [k[0] for k in train_data.provide_data] - label_names = [k[0] for k in train_data.provide_label] - if train_shared: - fixed_param_prefix = config.FIXED_PARAMS_SHARED - else: - fixed_param_prefix = config.FIXED_PARAMS - mod = MutableModule(sym, data_names=data_names, label_names=label_names, - logger=logger, context=ctx, work_load_list=work_load_list, - max_data_shapes=max_data_shape, max_label_shapes=max_label_shape, - fixed_param_prefix=fixed_param_prefix) - - # decide training params - # metric - eval_metric = metric.RPNAccMetric() - cls_metric = metric.RPNLogLossMetric() - bbox_metric = metric.RPNL1LossMetric() - eval_metrics = mx.metric.CompositeEvalMetric() - for child_metric in [eval_metric, cls_metric, bbox_metric]: - eval_metrics.add(child_metric) - # callback - batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent, auto_reset=False) - epoch_end_callback = mx.callback.do_checkpoint(prefix) - # decide learning rate - base_lr = lr - lr_factor = 0.1 - lr_epoch = [int(epoch) for epoch in lr_step.split(',')] - lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] - lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) - lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] - logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) - lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) - # optimizer - optimizer_params = {'momentum': 0.9, - 'wd': 0.0005, - 'learning_rate': lr, - 'lr_scheduler': lr_scheduler, - 'rescale_grad': (1.0 / batch_size), - 'clip_gradient': 5} - - # train - mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, - batch_end_callback=batch_end_callback, kvstore=kvstore, - optimizer='sgd', optimizer_params=optimizer_params, - arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) - - -def parse_args(): - parser = argparse.ArgumentParser(description='Train a Region Proposal Network') - # general - parser.add_argument('--network', help='network name', default=default.network, type=str) - parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) - args, rest = parser.parse_known_args() - generate_config(args.network, args.dataset) - parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) - parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) - parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) - # training - parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int) - parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str) - parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list) - parser.add_argument('--no_flip', help='disable flip images', action='store_true') - parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') - parser.add_argument('--resume', help='continue training', action='store_true') - # rpn - parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str) - parser.add_argument('--pretrained', help='pretrained model 
prefix', default=default.pretrained, type=str)
-    parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int)
-    parser.add_argument('--prefix', help='new model prefix', default=default.rpn_prefix, type=str)
-    parser.add_argument('--begin_epoch', help='begin epoch of training', default=0, type=int)
-    parser.add_argument('--end_epoch', help='end epoch of training', default=default.rpn_epoch, type=int)
-    parser.add_argument('--lr', help='base learning rate', default=default.rpn_lr, type=float)
-    parser.add_argument('--lr_step', help='learning rate steps (in epoch)', default=default.rpn_lr_step, type=str)
-    parser.add_argument('--train_shared', help='second round train shared params', action='store_true')
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-    logger.info('Called with argument: %s' % args)
-    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path,
-              args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume,
-              ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch,
-              train_shared=args.train_shared, lr=args.lr, lr_step=args.lr_step)
-
-if __name__ == '__main__':
-    main()
diff --git a/example/rcnn/rcnn/utils/__init__.py b/example/rcnn/rcnn/utils/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/example/rcnn/rcnn/utils/combine_model.py b/example/rcnn/rcnn/utils/combine_model.py
deleted file mode 100644
index eabe937be20c..000000000000
--- a/example/rcnn/rcnn/utils/combine_model.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from .load_model import load_checkpoint
-from .save_model import save_checkpoint
-
-
-def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out):
-    args1, auxs1 = load_checkpoint(prefix1, epoch1)
-    args2, auxs2 = load_checkpoint(prefix2, epoch2)
-    arg_names = list(args1.keys()) + list(args2.keys())
-    aux_names = list(auxs1.keys()) + list(auxs2.keys())
-    args = dict()
-    for arg in arg_names:
-        if arg in args1:
-            args[arg] = args1[arg]
-        else:
-            args[arg] = args2[arg]
-    auxs = dict()
-    for aux in aux_names:
-        if aux in auxs1:
-            auxs[aux] = auxs1[aux]
-        else:
-            auxs[aux] = auxs2[aux]
-    save_checkpoint(prefix_out, epoch_out, args, auxs)
diff --git a/example/rcnn/rcnn/utils/load_data.py b/example/rcnn/rcnn/utils/load_data.py
deleted file mode 100644
index 816b3b3a405e..000000000000
--- a/example/rcnn/rcnn/utils/load_data.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import numpy as np -from ..logger import logger -from ..config import config -from ..dataset import * - - -def load_gt_roidb(dataset_name, image_set_name, root_path, dataset_path, - flip=False): - """ load ground truth roidb """ - imdb = eval(dataset_name)(image_set_name, root_path, dataset_path) - roidb = imdb.gt_roidb() - if flip: - roidb = imdb.append_flipped_images(roidb) - return roidb - - -def load_proposal_roidb(dataset_name, image_set_name, root_path, dataset_path, - proposal='rpn', append_gt=True, flip=False): - """ load proposal roidb (append_gt when training) """ - imdb = eval(dataset_name)(image_set_name, root_path, dataset_path) - gt_roidb = imdb.gt_roidb() - roidb = eval('imdb.' + proposal + '_roidb')(gt_roidb, append_gt) - if flip: - roidb = imdb.append_flipped_images(roidb) - return roidb - - -def merge_roidb(roidbs): - """ roidb are list, concat them together """ - roidb = roidbs[0] - for r in roidbs[1:]: - roidb.extend(r) - return roidb - - -def filter_roidb(roidb): - """ remove roidb entries without usable rois """ - - def is_valid(entry): - """ valid images have at least 1 fg or bg roi """ - overlaps = entry['max_overlaps'] - fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] - bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] - valid = len(fg_inds) > 0 or len(bg_inds) > 0 - return valid - - num = len(roidb) - filtered_roidb = [entry for entry in roidb if is_valid(entry)] - num_after = len(filtered_roidb) - logger.info('load data: filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after)) - - return filtered_roidb diff --git a/example/rcnn/rcnn/utils/load_model.py b/example/rcnn/rcnn/utils/load_model.py deleted file mode 100644 index 0dc0752600c4..000000000000 --- a/example/rcnn/rcnn/utils/load_model.py +++ /dev/null @@ -1,76 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx - - -def load_checkpoint(prefix, epoch): - """ - Load model checkpoint from file. - :param prefix: Prefix of model name. - :param epoch: Epoch number of model we would like to load. 
- :return: (arg_params, aux_params) - arg_params : dict of str to NDArray - Model parameter, dict of name to NDArray of net's weights. - aux_params : dict of str to NDArray - Model parameter, dict of name to NDArray of net's auxiliary states. - """ - save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) - arg_params = {} - aux_params = {} - for k, v in save_dict.items(): - tp, name = k.split(':', 1) - if tp == 'arg': - arg_params[name] = v - if tp == 'aux': - aux_params[name] = v - return arg_params, aux_params - - -def convert_context(params, ctx): - """ - :param params: dict of str to NDArray - :param ctx: the context to convert to - :return: dict of str of NDArray with context ctx - """ - new_params = dict() - for k, v in params.items(): - new_params[k] = v.as_in_context(ctx) - return new_params - - -def load_param(prefix, epoch, convert=False, ctx=None, process=False): - """ - wrapper for load checkpoint - :param prefix: Prefix of model name. - :param epoch: Epoch number of model we would like to load. - :param convert: reference model should be converted to GPU NDArray first - :param ctx: if convert then ctx must be designated. - :param process: model should drop any test - :return: (arg_params, aux_params) - """ - arg_params, aux_params = load_checkpoint(prefix, epoch) - if convert: - if ctx is None: - ctx = mx.cpu() - arg_params = convert_context(arg_params, ctx) - aux_params = convert_context(aux_params, ctx) - if process: - tests = [k for k in arg_params.keys() if '_test' in k] - for test in tests: - arg_params[test.replace('_test', '')] = arg_params.pop(test) - return arg_params, aux_params diff --git a/example/rcnn/rcnn/utils/save_model.py b/example/rcnn/rcnn/utils/save_model.py deleted file mode 100644 index f27fb61b0f7a..000000000000 --- a/example/rcnn/rcnn/utils/save_model.py +++ /dev/null @@ -1,35 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -import mxnet as mx - - -def save_checkpoint(prefix, epoch, arg_params, aux_params): - """Checkpoint the model data into file. - :param prefix: Prefix of model name. - :param epoch: The epoch number of the model. - :param arg_params: dict of str to NDArray - Model parameter, dict of name to NDArray of net's weights. - :param aux_params: dict of str to NDArray - Model parameter, dict of name to NDArray of net's auxiliary states. - :return: None - prefix-epoch.params will be saved for parameters. 
-    """
-    save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()}
-    save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()})
-    param_name = '%s-%04d.params' % (prefix, epoch)
-    mx.nd.save(param_name, save_dict)
diff --git a/example/rcnn/script/additional_deps.sh b/example/rcnn/script/additional_deps.sh
deleted file mode 100755
index cddc391b13ed..000000000000
--- a/example/rcnn/script/additional_deps.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# install additional deps
-sudo apt install python-pip python-dev unzip python-matplotlib
-sudo pip install cython scikit-image easydict opencv-python
-
-# build cython extension
-make
diff --git a/example/rcnn/script/get_coco.sh b/example/rcnn/script/get_coco.sh
deleted file mode 100755
index a2f8f90e8a6d..000000000000
--- a/example/rcnn/script/get_coco.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-# make a data folder
-if ! [ -e data ]
-then
-    mkdir data
-fi
-
-pushd data
-
-# download images
-mkdir images
-declare -a filenames=("train2014" "val2014")
-for i in "${filenames[@]}"
-do
-    if ! [ -e $i.zip ]
-    then
-        echo $i.zip "not found, downloading"
-        wget http://msvocds.blob.core.windows.net/coco2014/$i.zip
-    fi
-    unzip $i.zip
-    find $i -name '*.jpg' -exec mv -t images {} +
-    rm -r $i
-done
-
-# download annotations
-anno="instances_train-val2014.zip"
-if !
[ -e $anno ] -then - echo $anno "not found, downloading" - wget http://msvocds.blob.core.windows.net/annotations-1-0-3/$anno -fi -unzip $anno - -# the result is coco/images/ coco/annotations/ -mkdir coco -mv images coco -mv annotations coco - -popd diff --git a/example/rcnn/script/get_pretrained_model.sh b/example/rcnn/script/get_pretrained_model.sh deleted file mode 100755 index 746be0b7ddd8..000000000000 --- a/example/rcnn/script/get_pretrained_model.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# make a model folder -if ! [ -e model ] -then - mkdir model -fi - -# download pretrained model -pushd model -wget http://data.dmlc.ml/mxnet/models/imagenet/vgg/vgg16-0000.params -wget http://data.dmlc.ml/mxnet/models/imagenet/resnet/101-layers/resnet-101-0000.params -popd diff --git a/example/rcnn/script/get_selective_search.sh b/example/rcnn/script/get_selective_search.sh deleted file mode 100755 index 487c653b23a7..000000000000 --- a/example/rcnn/script/get_selective_search.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# make a data folder -if ! [ -e data ] -then - mkdir data -fi - -pushd data - -# the result is selective_search_data -wget http://www.cs.berkeley.edu/~rbg/fast-rcnn-data/selective_search_data.tgz -tar xf selective_search_data.tgz - -popd diff --git a/example/rcnn/script/get_voc.sh b/example/rcnn/script/get_voc.sh deleted file mode 100755 index 060b79336619..000000000000 --- a/example/rcnn/script/get_voc.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# make a data folder -if ! [ -e data ] -then - mkdir data -fi - -pushd data - -# the result is VOCdevkit/VOC2007 -declare -a arr=("VOCtrainval_06-Nov-2007.tar" "VOCtest_06-Nov-2007.tar") -for i in "${arr[@]}" -do - if ! [ -e $i ] - then - echo $i "not found, downloading" - wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/$i - fi - tar -xf $i -done - -# the result is VOCdevkit/VOC2012 -voc2012="VOCtrainval_11-May-2012.tar" -if ! [ -e $voc2012 ] -then - echo $voc2012 "not found, downloading" - wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/$voc2012 -fi -tar -xf $voc2012 - -popd diff --git a/example/rcnn/script/resnet_voc07.sh b/example/rcnn/script/resnet_voc07.sh deleted file mode 100755 index 3cb421f016c0..000000000000 --- a/example/rcnn/script/resnet_voc07.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# run this experiment with -# nohup bash script/resnet_voc07.sh 0,1 &> resnet_voc07.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to resnet_voc07.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python train_end2end.py --network resnet --gpu $1 -python test.py --network resnet --gpu $gpu diff --git a/example/rcnn/script/resnet_voc0712.sh b/example/rcnn/script/resnet_voc0712.sh deleted file mode 100755 index aa2bd39499a4..000000000000 --- a/example/rcnn/script/resnet_voc0712.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
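This and the following driver scripts consume checkpoints through the (prefix, epoch) convention implemented by save_checkpoint/load_checkpoint above: parameters live in `'%s-%04d.params'` files whose keys carry `arg:`/`aux:` prefixes. A sketch of inspecting such a file directly, assuming `model/vgg16-0000.params` from get_pretrained_model.sh is present:

```python
import mxnet as mx

save_dict = mx.nd.load('model/vgg16-0000.params')  # prefix 'model/vgg16', epoch 0
arg_params = {k.split(':', 1)[1]: v for k, v in save_dict.items() if k.startswith('arg:')}
aux_params = {k.split(':', 1)[1]: v for k, v in save_dict.items() if k.startswith('aux:')}
print(len(arg_params), 'weight arrays;', sorted(arg_params)[:3])
```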
- - -# run this experiment with -# nohup bash script/resnet_voc00712.sh 0,1 &> resnet_voc0712.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to resnet_voc0712.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python train_end2end.py --network resnet --image_set 2007_trainval+2012_trainval --gpu $1 -python test.py --network resnet --gpu $gpu diff --git a/example/rcnn/script/vgg_alter_voc07.sh b/example/rcnn/script/vgg_alter_voc07.sh deleted file mode 100755 index 72ee0cddea2f..000000000000 --- a/example/rcnn/script/vgg_alter_voc07.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# run this experiment with -# nohup bash script/vgg_alter_voc07.sh 0,1 &> vgg_voc07.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to vgg_voc07.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python train_alternate.py --gpu $1 -python test.py --prefix model/final --epoch 0 --gpu $gpu diff --git a/example/rcnn/script/vgg_fast_rcnn.sh b/example/rcnn/script/vgg_fast_rcnn.sh deleted file mode 100755 index cafd2ea66b3e..000000000000 --- a/example/rcnn/script/vgg_fast_rcnn.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- - -# run this experiment with -# nohup bash script/vgg_fast_rcnn.sh 0,1 &> vgg_fast_rcnn.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to vgg_fast_rcnn.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python -m rcnn.tools.train_rcnn --proposal selective_search --gpu $1 -python -m rcnn.tools.test_rcnn --proposal selective_search --gpu $gpu diff --git a/example/rcnn/script/vgg_voc07.sh b/example/rcnn/script/vgg_voc07.sh deleted file mode 100755 index 22249e153838..000000000000 --- a/example/rcnn/script/vgg_voc07.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - - -# run this experiment with -# nohup bash script/vgg_voc07.sh 0,1 &> vgg_voc07.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to vgg_voc07.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python train_end2end.py --gpu $1 -python test.py --gpu $gpu diff --git a/example/rcnn/script/vgg_voc0712.sh b/example/rcnn/script/vgg_voc0712.sh deleted file mode 100755 index 22416dad4878..000000000000 --- a/example/rcnn/script/vgg_voc0712.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- - -# run this experiment with -# nohup bash script/vgg_voc00712.sh 0,1 &> vgg_voc0712.log & -# to use gpu 0,1 to train, gpu 0 to test and write logs to vgg_voc0712.log -gpu=${1:0:1} - -export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 -export PYTHONUNBUFFERED=1 - -python train_end2end.py --image_set 2007_trainval+2012_trainval --gpu $1 -python test.py --gpu $gpu diff --git a/example/rcnn/rcnn/__init__.py b/example/rcnn/symdata/__init__.py similarity index 100% rename from example/rcnn/rcnn/__init__.py rename to example/rcnn/symdata/__init__.py diff --git a/example/rcnn/symdata/anchor.py b/example/rcnn/symdata/anchor.py new file mode 100644 index 000000000000..f21f1a178d5a --- /dev/null +++ b/example/rcnn/symdata/anchor.py @@ -0,0 +1,176 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +from symdata.bbox import bbox_overlaps, bbox_transform + + +class AnchorGenerator: + def __init__(self, feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): + self._num_anchors = len(anchor_scales) * len(anchor_ratios) + self._feat_stride = feat_stride + self._base_anchors = self._generate_base_anchors(feat_stride, np.array(anchor_scales), np.array(anchor_ratios)) + + def generate(self, feat_height, feat_width): + shift_x = np.arange(0, feat_width) * self._feat_stride + shift_y = np.arange(0, feat_height) * self._feat_stride + shift_x, shift_y = np.meshgrid(shift_x, shift_y) + shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() + # add A anchors (1, A, 4) to + # cell K shifts (K, 1, 4) to get + # shift anchors (K, A, 4) + # reshape to (K*A, 4) shifted anchors + A = self._num_anchors + K = shifts.shape[0] + all_anchors = self._base_anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2)) + all_anchors = all_anchors.reshape((K * A, 4)) + return all_anchors + + @staticmethod + def _generate_base_anchors(base_size, scales, ratios): + """ + Generate anchor (reference) windows by enumerating aspect ratios X + scales wrt a reference (0, 0, 15, 15) window. + """ + base_anchor = np.array([1, 1, base_size, base_size]) - 1 + ratio_anchors = AnchorGenerator._ratio_enum(base_anchor, ratios) + anchors = np.vstack([AnchorGenerator._scale_enum(ratio_anchors[i, :], scales) + for i in range(ratio_anchors.shape[0])]) + return anchors + + @staticmethod + def _whctrs(anchor): + """ + Return width, height, x center, and y center for an anchor (window). + """ + w = anchor[2] - anchor[0] + 1 + h = anchor[3] - anchor[1] + 1 + x_ctr = anchor[0] + 0.5 * (w - 1) + y_ctr = anchor[1] + 0.5 * (h - 1) + return w, h, x_ctr, y_ctr + + @staticmethod + def _mkanchors(ws, hs, x_ctr, y_ctr): + """ + Given a vector of widths (ws) and heights (hs) around a center + (x_ctr, y_ctr), output a set of anchors (windows). 
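+        Example with hypothetical inputs: ws=[16.], hs=[16.], x_ctr=y_ctr=7.5 +        gives a single anchor [0., 0., 15., 15.], i.e. the canonical 16x16 base window.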
+ """ + ws = ws[:, np.newaxis] + hs = hs[:, np.newaxis] + anchors = np.hstack((x_ctr - 0.5 * (ws - 1), + y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), + y_ctr + 0.5 * (hs - 1))) + return anchors + + @staticmethod + def _ratio_enum(anchor, ratios): + """ + Enumerate a set of anchors for each aspect ratio wrt an anchor. + """ + w, h, x_ctr, y_ctr = AnchorGenerator._whctrs(anchor) + size = w * h + size_ratios = size / ratios + ws = np.round(np.sqrt(size_ratios)) + hs = np.round(ws * ratios) + anchors = AnchorGenerator._mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + @staticmethod + def _scale_enum(anchor, scales): + """ + Enumerate a set of anchors for each scale wrt an anchor. + """ + w, h, x_ctr, y_ctr = AnchorGenerator._whctrs(anchor) + ws = w * scales + hs = h * scales + anchors = AnchorGenerator._mkanchors(ws, hs, x_ctr, y_ctr) + return anchors + + +class AnchorSampler: + def __init__(self, allowed_border=0, batch_rois=256, fg_fraction=0.5, fg_overlap=0.7, bg_overlap=0.3): + self._allowed_border = allowed_border + self._num_batch = batch_rois + self._num_fg = int(batch_rois * fg_fraction) + self._fg_overlap = fg_overlap + self._bg_overlap = bg_overlap + + def assign(self, anchors, gt_boxes, im_height, im_width): + num_anchors = anchors.shape[0] + + # filter out padded gt_boxes + valid_labels = np.where(gt_boxes[:, -1] > 0)[0] + gt_boxes = gt_boxes[valid_labels] + + # filter out anchors outside the region + inds_inside = np.where((anchors[:, 0] >= -self._allowed_border) & + (anchors[:, 2] < im_width + self._allowed_border) & + (anchors[:, 1] >= -self._allowed_border) & + (anchors[:, 3] < im_height + self._allowed_border))[0] + anchors = anchors[inds_inside, :] + num_valid = len(inds_inside) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.ones((num_valid,), dtype=np.float32) * -1 + bbox_targets = np.zeros((num_valid, 4), dtype=np.float32) + bbox_weights = np.zeros((num_valid, 4), dtype=np.float32) + + # sample for positive labels + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), gt_boxes.astype(np.float)) + gt_max_overlaps = overlaps.max(axis=0) + + # fg anchors: anchor with highest overlap for each gt; or overlap > iou thresh + fg_inds = np.where((overlaps >= self._fg_overlap) | (overlaps == gt_max_overlaps))[0] + + # subsample to num_fg + if len(fg_inds) > self._num_fg: + fg_inds = np.random.choice(fg_inds, size=self._num_fg, replace=False) + + # bg anchor: anchor with overlap < iou thresh but not highest overlap for some gt + bg_inds = np.where((overlaps < self._bg_overlap) & (overlaps < gt_max_overlaps))[0] + + if len(bg_inds) > self._num_batch - len(fg_inds): + bg_inds = np.random.choice(bg_inds, size=self._num_batch - len(fg_inds), replace=False) + + # assign label + labels[fg_inds] = 1 + labels[bg_inds] = 0 + + # assign to argmax overlap + argmax_overlaps = overlaps.argmax(axis=1) + bbox_targets[fg_inds, :] = bbox_transform(anchors[fg_inds, :], gt_boxes[argmax_overlaps[fg_inds], :], + box_stds=(1.0, 1.0, 1.0, 1.0)) + + # only fg anchors has bbox_targets + bbox_weights[fg_inds, :] = 1 + else: + # randomly draw bg anchors + bg_inds = np.random.choice(np.arange(num_valid), size=self._num_batch, replace=False) + labels[bg_inds] = 0 + + all_labels = np.ones((num_anchors,), dtype=np.float32) * -1 + all_labels[inds_inside] = labels + all_bbox_targets = np.zeros((num_anchors, 4), dtype=np.float32) + all_bbox_targets[inds_inside, :] = bbox_targets + 
all_bbox_weights = np.zeros((num_anchors, 4), dtype=np.float32) + all_bbox_weights[inds_inside, :] = bbox_weights + + return all_labels, all_bbox_targets, all_bbox_weights diff --git a/example/rcnn/rcnn/processing/bbox_transform.py b/example/rcnn/symdata/bbox.py similarity index 56% rename from example/rcnn/rcnn/processing/bbox_transform.py rename to example/rcnn/symdata/bbox.py index 04fa81feda78..f94c8c9546b9 100644 --- a/example/rcnn/rcnn/processing/bbox_transform.py +++ b/example/rcnn/symdata/bbox.py @@ -16,14 +16,26 @@ # under the License. import numpy as np -from ..cython.bbox import bbox_overlaps_cython -def bbox_overlaps(boxes, query_boxes): - return bbox_overlaps_cython(boxes, query_boxes) +def bbox_flip(bbox, width, flip_x=False): + """ + invalid value in bbox_transform if this wrong (no overlap), note index 0 and 2 + also note need to save before assignment + :param bbox: [n][x1, y1, x2, y2] + :param width: cv2 (height, width, channel) + :param flip_x: will flip x1 and x2 + :return: flipped box + """ + if flip_x: + xmax = width - bbox[:, 0] + xmin = width - bbox[:, 2] + bbox[:, 0] = xmin + bbox[:, 2] = xmax + return bbox -def bbox_overlaps_py(boxes, query_boxes): +def bbox_overlaps(boxes, query_boxes): """ determine overlaps between boxes and query_boxes :param boxes: n * 4 bounding boxes @@ -64,7 +76,7 @@ def clip_boxes(boxes, im_shape): return boxes -def nonlinear_transform(ex_rois, gt_rois): +def bbox_transform(ex_rois, gt_rois, box_stds): """ compute bounding box regression targets from ex_rois to gt_rois :param ex_rois: [N, 4] @@ -83,17 +95,16 @@ def nonlinear_transform(ex_rois, gt_rois): gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) - targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) - targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) - targets_dw = np.log(gt_widths / ex_widths) - targets_dh = np.log(gt_heights / ex_heights) + targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-14) / box_stds[0] + targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-14) / box_stds[1] + targets_dw = np.log(gt_widths / ex_widths) / box_stds[2] + targets_dh = np.log(gt_heights / ex_heights) / box_stds[3] - targets = np.vstack( - (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() + targets = np.vstack((targets_dx, targets_dy, targets_dw, targets_dh)).transpose() return targets -def nonlinear_pred(boxes, box_deltas): +def bbox_pred(boxes, box_deltas, box_stds): """ Transform the set of class-agnostic boxes into class-specific boxes by applying the predicted offsets (box_deltas) @@ -104,16 +115,15 @@ def nonlinear_pred(boxes, box_deltas): if boxes.shape[0] == 0: return np.zeros((0, box_deltas.shape[1])) - boxes = boxes.astype(np.float, copy=False) widths = boxes[:, 2] - boxes[:, 0] + 1.0 heights = boxes[:, 3] - boxes[:, 1] + 1.0 ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0) ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0) - dx = box_deltas[:, 0::4] - dy = box_deltas[:, 1::4] - dw = box_deltas[:, 2::4] - dh = box_deltas[:, 3::4] + dx = box_deltas[:, 0::4] * box_stds[0] + dy = box_deltas[:, 1::4] * box_stds[1] + dw = box_deltas[:, 2::4] * box_stds[2] + dh = box_deltas[:, 3::4] * box_stds[3] pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] @@ -133,47 +143,72 @@ def nonlinear_pred(boxes, box_deltas): return pred_boxes -def iou_transform(ex_rois, gt_rois): - """ return bbox targets, IoU loss uses gt_rois as gt """ - assert ex_rois.shape[0] == 
gt_rois.shape[0], 'inconsistent rois number' - return gt_rois - - -def iou_pred(boxes, box_deltas): +def nms(dets, thresh): """ - Transform the set of class-agnostic boxes into class-specific boxes - by applying the predicted offsets (box_deltas) - :param boxes: !important [N 4] - :param box_deltas: [N, 4 * num_classes] - :return: [N 4 * num_classes] + greedily select boxes with high confidence and overlap with current maximum <= thresh + rule out overlap >= thresh + :param dets: [[x1, y1, x2, y2 score]] + :param thresh: retain overlap < thresh + :return: indexes to keep """ - if boxes.shape[0] == 0: - return np.zeros((0, box_deltas.shape[1])) - - boxes = boxes.astype(np.float, copy=False) - x1 = boxes[:, 0] - y1 = boxes[:, 1] - x2 = boxes[:, 2] - y2 = boxes[:, 3] - - dx1 = box_deltas[:, 0::4] - dy1 = box_deltas[:, 1::4] - dx2 = box_deltas[:, 2::4] - dy2 = box_deltas[:, 3::4] - - pred_boxes = np.zeros(box_deltas.shape) - # x1 - pred_boxes[:, 0::4] = dx1 + x1[:, np.newaxis] - # y1 - pred_boxes[:, 1::4] = dy1 + y1[:, np.newaxis] - # x2 - pred_boxes[:, 2::4] = dx2 + x2[:, np.newaxis] - # y2 - pred_boxes[:, 3::4] = dy2 + y2[:, np.newaxis] - - return pred_boxes - - -# define bbox_transform and bbox_pred -bbox_transform = nonlinear_transform -bbox_pred = nonlinear_pred + x1 = dets[:, 0] + y1 = dets[:, 1] + x2 = dets[:, 2] + y2 = dets[:, 3] + scores = dets[:, 4] + + areas = (x2 - x1 + 1) * (y2 - y1 + 1) + order = scores.argsort()[::-1] + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + xx1 = np.maximum(x1[i], x1[order[1:]]) + yy1 = np.maximum(y1[i], y1[order[1:]]) + xx2 = np.minimum(x2[i], x2[order[1:]]) + yy2 = np.minimum(y2[i], y2[order[1:]]) + + w = np.maximum(0.0, xx2 - xx1 + 1) + h = np.maximum(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (areas[i] + areas[order[1:]] - inter) + + inds = np.where(ovr <= thresh)[0] + order = order[inds + 1] + + return keep + + +def im_detect(rois, scores, bbox_deltas, im_info, + bbox_stds, nms_thresh, conf_thresh): + """rois (nroi, 4), scores (nrois, nclasses), bbox_deltas (nrois, 4 * nclasses), im_info (3)""" + rois = rois.asnumpy() + scores = scores.asnumpy() + bbox_deltas = bbox_deltas.asnumpy() + + im_info = im_info.asnumpy() + height, width, scale = im_info + + # post processing + pred_boxes = bbox_pred(rois, bbox_deltas, bbox_stds) + pred_boxes = clip_boxes(pred_boxes, (height, width)) + + # we used scaled image & roi to train, so it is necessary to transform them back + pred_boxes = pred_boxes / scale + + # convert to per class detection results + det = [] + for j in range(1, scores.shape[-1]): + indexes = np.where(scores[:, j] > conf_thresh)[0] + cls_scores = scores[indexes, j, np.newaxis] + cls_boxes = pred_boxes[indexes, j * 4:(j + 1) * 4] + cls_dets = np.hstack((cls_boxes, cls_scores)) + keep = nms(cls_dets, thresh=nms_thresh) + + cls_id = np.ones_like(cls_scores) * j + det.append(np.hstack((cls_id, cls_scores, cls_boxes))[keep, :]) + + # assemble all classes + det = np.concatenate(det, axis=0) + return det diff --git a/example/rcnn/symdata/image.py b/example/rcnn/symdata/image.py new file mode 100644 index 000000000000..156c01853a8e --- /dev/null +++ b/example/rcnn/symdata/image.py @@ -0,0 +1,144 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +#   http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied.  See the License for the +# specific language governing permissions and limitations +# under the License. + +import numpy as np +import cv2 + + +def get_image(roi_rec, short, max_size, mean, std): +    """ +    read, resize, transform image, return im_tensor, im_info, gt_boxes +    roi_rec should have keys: ["image", "boxes", "gt_classes", "flipped"] +    0 --- x (width, second dim of im) +    | +    y (height, first dim of im) +    """ +    im = imdecode(roi_rec['image']) +    if roi_rec["flipped"]: +        im = im[:, ::-1, :] +    im, im_scale = resize(im, short, max_size) +    height, width = im.shape[:2] +    im_info = np.array([height, width, im_scale], dtype=np.float32) +    im_tensor = transform(im, mean, std) + +    # gt boxes: (x1, y1, x2, y2, cls) +    if roi_rec['gt_classes'].size > 0: +        gt_inds = np.where(roi_rec['gt_classes'] != 0)[0] +        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) +        gt_boxes[:, 0:4] = roi_rec['boxes'][gt_inds, :] +        gt_boxes[:, 4] = roi_rec['gt_classes'][gt_inds] +        # scale gt_boxes +        gt_boxes[:, 0:4] *= im_scale +    else: +        gt_boxes = np.empty((0, 5), dtype=np.float32) + +    return im_tensor, im_info, gt_boxes + + +def imdecode(image_path): +    """Return BGR image read by opencv""" +    import os +    assert os.path.exists(image_path), image_path + ' not found' +    im = cv2.imread(image_path) +    return im + + +def resize(im, short, max_size): +    """ +    only resize input image to target size and return scale +    :param im: BGR image input by opencv +    :param short: one dimensional size (the short side) +    :param max_size: one dimensional max size (the long side) +    :return: resized image (numpy array) and scale (float) +    """ +    im_shape = im.shape +    im_size_min = np.min(im_shape[0:2]) +    im_size_max = np.max(im_shape[0:2]) +    im_scale = float(short) / float(im_size_min) +    # prevent bigger axis from being more than max_size: +    if np.round(im_scale * im_size_max) > max_size: +        im_scale = float(max_size) / float(im_size_max) +    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) +    return im, im_scale + + +def transform(im, mean, std): +    """ +    transform into mxnet tensor, +    subtract the pixel mean, divide by the pixel std and reorder BGR HWC to RGB CHW +    :param im: [height, width, channel] in BGR +    :param mean: [RGB pixel mean] +    :param std: [RGB pixel std] +    :return: [channel, height, width] +    """ +    im_tensor = np.zeros((3, im.shape[0], im.shape[1])) +    for i in range(3): +        im_tensor[i, :, :] = (im[:, :, 2 - i] - mean[i]) / std[i] +    return im_tensor + + +def transform_inverse(im_tensor, mean, std): +    """ +    transform from mxnet im_tensor to ordinary RGB image +    im_tensor is limited to one image +    :param im_tensor: [channel, height, width] +    :param mean: [RGB pixel mean] +    :param std: [RGB pixel std] +    :return: im [height, width, channel(RGB)] +    """ +    assert im_tensor.shape[0] == 3 +    im = im_tensor.transpose((1, 2, 0)) +    im = im * std + mean +    im = im.astype(np.uint8) +    return im + + +def tensor_vstack(tensor_list, pad=0): +    """ +    vertically stack tensors by adding a new axis +    expand dims if only 1 tensor +    :param tensor_list: list of
tensor to be stacked vertically + :param pad: label to pad with + :return: tensor with max shape + """ + if len(tensor_list) == 1: + return tensor_list[0][np.newaxis, :] + + ndim = len(tensor_list[0].shape) + dimensions = [len(tensor_list)] # first dim is batch size + for dim in range(ndim): + dimensions.append(max([tensor.shape[dim] for tensor in tensor_list])) + + dtype = tensor_list[0].dtype + if pad == 0: + all_tensor = np.zeros(tuple(dimensions), dtype=dtype) + elif pad == 1: + all_tensor = np.ones(tuple(dimensions), dtype=dtype) + else: + all_tensor = np.full(tuple(dimensions), pad, dtype=dtype) + if ndim == 1: + for ind, tensor in enumerate(tensor_list): + all_tensor[ind, :tensor.shape[0]] = tensor + elif ndim == 2: + for ind, tensor in enumerate(tensor_list): + all_tensor[ind, :tensor.shape[0], :tensor.shape[1]] = tensor + elif ndim == 3: + for ind, tensor in enumerate(tensor_list): + all_tensor[ind, :tensor.shape[0], :tensor.shape[1], :tensor.shape[2]] = tensor + else: + raise Exception('Sorry, unimplemented.') + return all_tensor diff --git a/example/rcnn/symdata/loader.py b/example/rcnn/symdata/loader.py new file mode 100644 index 000000000000..fc4409dfadb3 --- /dev/null +++ b/example/rcnn/symdata/loader.py @@ -0,0 +1,243 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
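+# A minimal usage sketch for the helpers below (illustrative only: the file name, +# the 600/1000 sizes and the mean/std values are assumptions, not fixed here): +# +#   im_tensor, im_info, im_orig = load_test('demo.jpg', short=600, max_size=1000, +#                                           mean=(123.68, 116.78, 103.94), std=(1., 1., 1.)) +#   data_batch = generate_batch(im_tensor, im_info)  # ready for a Module's forward +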
+ +import mxnet as mx +import numpy as np + +from symdata.anchor import AnchorGenerator, AnchorSampler +from symdata.image import imdecode, resize, transform, get_image, tensor_vstack + + +def load_test(filename, short, max_size, mean, std): +    # read and transform image +    im_orig = imdecode(filename) +    im, im_scale = resize(im_orig, short, max_size) +    height, width = im.shape[:2] +    im_info = mx.nd.array([height, width, im_scale]) + +    # transform into tensor and normalize +    im_tensor = transform(im, mean, std) + +    # for 1-batch inference, expand dims manually; batchify (or nd.stack) cannot be used here +    im_tensor = mx.nd.array(im_tensor).expand_dims(0) +    im_info = mx.nd.array(im_info).expand_dims(0) + +    # transform cv2 BGR image to RGB for matplotlib +    im_orig = im_orig[:, :, (2, 1, 0)] +    return im_tensor, im_info, im_orig + + +def generate_batch(im_tensor, im_info): +    """return batch""" +    data = [im_tensor, im_info] +    data_shapes = [('data', im_tensor.shape), ('im_info', im_info.shape)] +    data_batch = mx.io.DataBatch(data=data, label=None, provide_data=data_shapes, provide_label=None) +    return data_batch + + +class TestLoader(mx.io.DataIter): +    def __init__(self, roidb, batch_size, short, max_size, mean, std): +        super(TestLoader, self).__init__() + +        # save parameters as properties +        self._roidb = roidb +        self._batch_size = batch_size +        self._short = short +        self._max_size = max_size +        self._mean = mean +        self._std = std + +        # infer properties from roidb +        self._size = len(self._roidb) +        self._index = np.arange(self._size) + +        # decide data and label names (only for training) +        self._data_name = ['data', 'im_info'] +        self._label_name = None + +        # status variable +        self._cur = 0 +        self._data = None +        self._label = None + +        # get first batch to fill in provide_data and provide_label +        self.next() +        self.reset() + +    @property +    def provide_data(self): +        return [(k, v.shape) for k, v in zip(self._data_name, self._data)] + +    @property +    def provide_label(self): +        return None + +    def reset(self): +        self._cur = 0 + +    def iter_next(self): +        return self._cur + self._batch_size <= self._size + +    def next(self): +        if self.iter_next(): +            data_batch = mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), +                                         pad=self.getpad(), index=self.getindex(), +                                         provide_data=self.provide_data, provide_label=self.provide_label) +            self._cur += self._batch_size +            return data_batch +        else: +            raise StopIteration + +    def getdata(self): +        indices = self.getindex() +        im_tensor, im_info = [], [] +        for index in indices: +            roi_rec = self._roidb[index] +            b_im_tensor, b_im_info, _ = get_image(roi_rec, self._short, self._max_size, self._mean, self._std) +            im_tensor.append(b_im_tensor) +            im_info.append(b_im_info) +        im_tensor = mx.nd.array(tensor_vstack(im_tensor, pad=0)) +        im_info = mx.nd.array(tensor_vstack(im_info, pad=0)) +        self._data = im_tensor, im_info +        return self._data + +    def getlabel(self): +        return None + +    def getindex(self): +        cur_from = self._cur +        cur_to = min(cur_from + self._batch_size, self._size) +        return np.arange(cur_from, cur_to) + +    def getpad(self): +        return max(self._cur + self.batch_size - self._size, 0) + + +class AnchorLoader(mx.io.DataIter): +    def __init__(self, roidb, batch_size, short, max_size, mean, std, +                 feat_sym, anchor_generator: AnchorGenerator, anchor_sampler: AnchorSampler, +                 shuffle=False): +        super(AnchorLoader, self).__init__() + +        # save parameters as properties +        self._roidb = roidb +        self._batch_size = batch_size +        self._short = short +        self._max_size = max_size +        self._mean = mean +
self._std = std +        self._feat_sym = feat_sym +        self._ag = anchor_generator +        self._as = anchor_sampler +        self._shuffle = shuffle + +        # infer properties from roidb +        self._size = len(roidb) +        self._index = np.arange(self._size) + +        # decide data and label names +        self._data_name = ['data', 'im_info', 'gt_boxes'] +        self._label_name = ['label', 'bbox_target', 'bbox_weight'] + +        # status variable +        self._cur = 0 +        self._data = None +        self._label = None + +        # get first batch to fill in provide_data and provide_label +        self.next() +        self.reset() + +    @property +    def provide_data(self): +        return [(k, v.shape) for k, v in zip(self._data_name, self._data)] + +    @property +    def provide_label(self): +        return [(k, v.shape) for k, v in zip(self._label_name, self._label)] + +    def reset(self): +        self._cur = 0 +        if self._shuffle: +            np.random.shuffle(self._index) + +    def iter_next(self): +        return self._cur + self._batch_size <= self._size + +    def next(self): +        if self.iter_next(): +            data_batch = mx.io.DataBatch(data=self.getdata(), label=self.getlabel(), +                                         pad=self.getpad(), index=self.getindex(), +                                         provide_data=self.provide_data, provide_label=self.provide_label) +            self._cur += self._batch_size +            return data_batch +        else: +            raise StopIteration + +    def getdata(self): +        indices = self.getindex() +        im_tensor, im_info, gt_boxes = [], [], [] +        for index in indices: +            roi_rec = self._roidb[index] +            b_im_tensor, b_im_info, b_gt_boxes = get_image(roi_rec, self._short, self._max_size, self._mean, self._std) +            im_tensor.append(b_im_tensor) +            im_info.append(b_im_info) +            gt_boxes.append(b_gt_boxes) +        im_tensor = mx.nd.array(tensor_vstack(im_tensor, pad=0)) +        im_info = mx.nd.array(tensor_vstack(im_info, pad=0)) +        gt_boxes = mx.nd.array(tensor_vstack(gt_boxes, pad=-1)) +        self._data = im_tensor, im_info, gt_boxes +        return self._data + +    def getlabel(self): +        im_tensor, im_info, gt_boxes = self._data + +        # all stacked images share the same anchors +        _, out_shape, _ = self._feat_sym.infer_shape(data=im_tensor.shape) +        feat_height, feat_width = out_shape[0][-2:] +        anchors = self._ag.generate(feat_height, feat_width) + +        # assign anchors according to each image's real size encoded in im_info +        label, bbox_target, bbox_weight = [], [], [] +        for batch_ind in range(im_info.shape[0]): +            b_im_info = im_info[batch_ind].asnumpy() +            b_gt_boxes = gt_boxes[batch_ind].asnumpy() +            b_im_height, b_im_width = b_im_info[:2] + +            b_label, b_bbox_target, b_bbox_weight = self._as.assign(anchors, b_gt_boxes, b_im_height, b_im_width) + +            b_label = b_label.reshape((feat_height, feat_width, -1)).transpose((2, 0, 1)).flatten() +            b_bbox_target = b_bbox_target.reshape((feat_height, feat_width, -1)).transpose((2, 0, 1)) +            b_bbox_weight = b_bbox_weight.reshape((feat_height, feat_width, -1)).transpose((2, 0, 1)) + +            label.append(b_label) +            bbox_target.append(b_bbox_target) +            bbox_weight.append(b_bbox_weight) + +        label = mx.nd.array(tensor_vstack(label, pad=-1)) +        bbox_target = mx.nd.array(tensor_vstack(bbox_target, pad=0)) +        bbox_weight = mx.nd.array(tensor_vstack(bbox_weight, pad=0)) +        self._label = label, bbox_target, bbox_weight +        return self._label + +    def getindex(self): +        cur_from = self._cur +        cur_to = min(cur_from + self._batch_size, self._size) +        return np.arange(cur_from, cur_to) + +    def getpad(self): +        return max(self._cur + self.batch_size - self._size, 0) diff --git a/example/rcnn/rcnn/dataset/ds_utils.py b/example/rcnn/symdata/vis.py similarity index 50% rename from example/rcnn/rcnn/dataset/ds_utils.py rename to example/rcnn/symdata/vis.py index
8f90e8d390e1..d9e80d882390 100644 --- a/example/rcnn/rcnn/dataset/ds_utils.py +++ b/example/rcnn/symdata/vis.py @@ -15,19 +15,18 @@ # specific language governing permissions and limitations # under the License. -import numpy as np - - -def unique_boxes(boxes, scale=1.0): - """ return indices of unique boxes """ - v = np.array([1, 1e3, 1e6, 1e9]) - hashes = np.round(boxes * scale).dot(v).astype(np.int) - _, index = np.unique(hashes, return_index=True) - return np.sort(index) - - -def filter_small_boxes(boxes, min_size): - w = boxes[:, 2] - boxes[:, 0] - h = boxes[:, 3] - boxes[:, 1] - keep = np.where((w >= min_size) & (h > min_size))[0] - return keep +def vis_detection(im_orig, detections, class_names, thresh=0.7): + """visualize [cls, conf, x1, y1, x2, y2]""" + import matplotlib.pyplot as plt + import random + plt.imshow(im_orig) + colors = [(random.random(), random.random(), random.random()) for _ in class_names] + for [cls, conf, x1, y1, x2, y2] in detections: + cls = int(cls) + if cls > 0 and conf > thresh: + rect = plt.Rectangle((x1, y1), x2 - x1, y2 - y1, + fill=False, edgecolor=colors[cls], linewidth=3.5) + plt.gca().add_patch(rect) + plt.gca().text(x1, y1 - 2, '{:s} {:.3f}'.format(class_names[cls], conf), + bbox=dict(facecolor=colors[cls], alpha=0.5), fontsize=12, color='white') + plt.show() diff --git a/example/rcnn/rcnn/core/__init__.py b/example/rcnn/symimdb/__init__.py similarity index 100% rename from example/rcnn/rcnn/core/__init__.py rename to example/rcnn/symimdb/__init__.py diff --git a/example/rcnn/rcnn/dataset/coco.py b/example/rcnn/symimdb/coco.py similarity index 52% rename from example/rcnn/rcnn/dataset/coco.py rename to example/rcnn/symimdb/coco.py index 1ec7567958aa..cb863ec6be4c 100644 --- a/example/rcnn/rcnn/dataset/coco.py +++ b/example/rcnn/symimdb/coco.py @@ -15,91 +15,67 @@ # specific language governing permissions and limitations # under the License. 
-try: - import cPickle as pickle -except ImportError: - import pickle -import cv2 import os import json import numpy as np from builtins import range -from ..logger import logger +from symnet.logger import logger from .imdb import IMDB # coco api -from ..pycocotools.coco import COCO -from ..pycocotools.cocoeval import COCOeval -from ..pycocotools import mask as COCOmask +from pycocotools.coco import COCO +from pycocotools.cocoeval import COCOeval class coco(IMDB): + classes = ['__background__', # always index 0 + 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', + 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', + 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', + 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', + 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', + 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', + 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', + 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', + 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', + 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', + 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', + 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', + 'scissors', 'teddy bear', 'hair drier', 'toothbrush'] + def __init__(self, image_set, root_path, data_path): """ fill basic information to initialize imdb - :param image_set: train2014, val2014, test2015 - :param root_path: 'data', will write 'rpn_data', 'cache' - :param data_path: 'data/coco' + :param image_set: train2017, val2017 + :param root_path: 'data', will write 'cache' + :param data_path: 'data/coco', load data and write results """ - super(coco, self).__init__('COCO', image_set, root_path, data_path) - self.root_path = root_path - self.data_path = data_path - self.coco = COCO(self._get_ann_file()) - - # deal with class names - cats = [cat['name'] for cat in self.coco.loadCats(self.coco.getCatIds())] - self.classes = ['__background__'] + cats - self.num_classes = len(self.classes) - self._class_to_ind = dict(zip(self.classes, range(self.num_classes))) - self._class_to_coco_ind = dict(zip(cats, self.coco.getCatIds())) - self._coco_ind_to_class_ind = dict([(self._class_to_coco_ind[cls], self._class_to_ind[cls]) - for cls in self.classes[1:]]) - - # load image file names - self.image_set_index = self._load_image_set_index() - self.num_images = len(self.image_set_index) + super(coco, self).__init__('coco_' + image_set, root_path) + # example: annotations/instances_train2017.json + self._anno_file = os.path.join(data_path, 'annotations', 'instances_' + image_set + '.json') + # example train2017/000000119993.jpg + self._image_file_tmpl = os.path.join(data_path, image_set, '{}') + # example detections_val2017_results.json + self._result_file = os.path.join(data_path, 'detections_{}_results.json'.format(image_set)) + # get roidb + self._roidb = self._get_cached('roidb', self._load_gt_roidb) logger.info('%s num_images %d' % (self.name, self.num_images)) - # deal with data name - view_map = {'minival2014': 'val2014', - 'valminusminival2014': 'val2014'} - self.data_name = view_map[image_set] if image_set in view_map else image_set - - def _get_ann_file(self): - """ self.data_path / annotations / instances_train2014.json """ - prefix = 'instances' if 'test' not in self.image_set else 'image_info' - return os.path.join(self.data_path, 'annotations', - prefix + '_' + 
self.image_set + '.json') - - def _load_image_set_index(self): - """ image id: int """ - image_ids = self.coco.getImgIds() - return image_ids - - def image_path_from_index(self, index): - """ example: images / train2014 / COCO_train2014_000000119993.jpg """ - filename = 'COCO_%s_%012d.jpg' % (self.data_name, index) - image_path = os.path.join(self.data_path, 'images', self.data_name, filename) - assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path) - return image_path - - def gt_roidb(self): - cache_file = os.path.join(self.cache_path, self.name + '_gt_roidb.pkl') - if os.path.exists(cache_file): - with open(cache_file, 'rb') as fid: - roidb = pickle.load(fid) - logger.info('%s gt roidb loaded from %s' % (self.name, cache_file)) - return roidb - - gt_roidb = [self._load_coco_annotation(index) for index in self.image_set_index] - with open(cache_file, 'wb') as fid: - pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) - logger.info('%s wrote gt roidb to %s' % (self.name, cache_file)) - + def _load_gt_roidb(self): + _coco = COCO(self._anno_file) + # deal with class names + cats = [cat['name'] for cat in _coco.loadCats(_coco.getCatIds())] + class_to_coco_ind = dict(zip(cats, _coco.getCatIds())) + class_to_ind = dict(zip(self.classes, range(self.num_classes))) + coco_ind_to_class_ind = dict([(class_to_coco_ind[cls], class_to_ind[cls]) + for cls in self.classes[1:]]) + + image_ids = _coco.getImgIds() + gt_roidb = [self._load_annotation(_coco, coco_ind_to_class_ind, index) for index in image_ids] return gt_roidb - def _load_coco_annotation(self, index): + def _load_annotation(self, _coco, coco_ind_to_class_ind, index): """ coco ann: [u'segmentation', u'area', u'iscrowd', u'image_id', u'bbox', u'category_id', u'id'] iscrowd: @@ -110,12 +86,13 @@ def _load_coco_annotation(self, index): :param index: coco image id :return: roidb entry """ - im_ann = self.coco.loadImgs(index)[0] + im_ann = _coco.loadImgs(index)[0] + filename = self._image_file_tmpl.format(im_ann['file_name']) width = im_ann['width'] height = im_ann['height'] - annIds = self.coco.getAnnIds(imgIds=index, iscrowd=None) - objs = self.coco.loadAnns(annIds) + annIds = _coco.getAnnIds(imgIds=index, iscrowd=None) + objs = _coco.loadAnns(annIds) # sanitize bboxes valid_objs = [] @@ -132,60 +109,50 @@ def _load_coco_annotation(self, index): num_objs = len(objs) boxes = np.zeros((num_objs, 4), dtype=np.uint16) - gt_classes = np.zeros((num_objs), dtype=np.int32) - overlaps = np.zeros((num_objs, self.num_classes), dtype=np.float32) - + gt_classes = np.zeros((num_objs,), dtype=np.int32) for ix, obj in enumerate(objs): - cls = self._coco_ind_to_class_ind[obj['category_id']] + cls = coco_ind_to_class_ind[obj['category_id']] boxes[ix, :] = obj['clean_bbox'] gt_classes[ix] = cls - if obj['iscrowd']: - overlaps[ix, :] = -1.0 - else: - overlaps[ix, cls] = 1.0 - roi_rec = {'image': self.image_path_from_index(index), + roi_rec = {'index': index, + 'image': filename, 'height': height, 'width': width, 'boxes': boxes, 'gt_classes': gt_classes, - 'gt_overlaps': overlaps, - 'max_classes': overlaps.argmax(axis=1), - 'max_overlaps': overlaps.max(axis=1), 'flipped': False} return roi_rec - def evaluate_detections(self, detections): - """ detections_val2014_results.json """ - res_folder = os.path.join(self.cache_path, 'results') - if not os.path.exists(res_folder): - os.makedirs(res_folder) - res_file = os.path.join(res_folder, 'detections_%s_results.json' % self.image_set) - self._write_coco_results(detections, res_file) - if 'test' not 
in self.image_set: - self._do_python_eval(res_file, res_folder) - - def _write_coco_results(self, detections, res_file): + def _evaluate_detections(self, detections, **kargs): + _coco = COCO(self._anno_file) + self._write_coco_results(_coco, detections) + self._do_python_eval(_coco) + + def _write_coco_results(self, _coco, detections): """ example results [{"image_id": 42, "category_id": 18, "bbox": [258.15,41.29,348.26,243.78], "score": 0.236}, ...] """ + cats = [cat['name'] for cat in _coco.loadCats(_coco.getCatIds())] + class_to_coco_ind = dict(zip(cats, _coco.getCatIds())) results = [] for cls_ind, cls in enumerate(self.classes): if cls == '__background__': continue logger.info('collecting %s results (%d/%d)' % (cls, cls_ind, self.num_classes - 1)) - coco_cat_id = self._class_to_coco_ind[cls] + coco_cat_id = class_to_coco_ind[cls] results.extend(self._coco_results_one_category(detections[cls_ind], coco_cat_id)) - logger.info('writing results json to %s' % res_file) - with open(res_file, 'w') as f: + logger.info('writing results json to %s' % self._result_file) + with open(self._result_file, 'w') as f: json.dump(results, f, sort_keys=True, indent=4) def _coco_results_one_category(self, boxes, cat_id): results = [] - for im_ind, index in enumerate(self.image_set_index): + for im_ind, roi_rec in enumerate(self.roidb): + index = roi_rec['index'] dets = boxes[im_ind].astype(np.float) if len(dets) == 0: continue @@ -201,20 +168,14 @@ def _coco_results_one_category(self, boxes, cat_id): results.extend(result) return results - def _do_python_eval(self, res_file, res_folder): - ann_type = 'bbox' - coco_dt = self.coco.loadRes(res_file) - coco_eval = COCOeval(self.coco, coco_dt) - coco_eval.params.useSegm = (ann_type == 'segm') + def _do_python_eval(self, _coco): + coco_dt = _coco.loadRes(self._result_file) + coco_eval = COCOeval(_coco, coco_dt) + coco_eval.params.useSegm = False coco_eval.evaluate() coco_eval.accumulate() self._print_detection_metrics(coco_eval) - eval_file = os.path.join(res_folder, 'detections_%s_results.pkl' % self.image_set) - with open(eval_file, 'wb') as f: - pickle.dump(coco_eval, f, pickle.HIGHEST_PROTOCOL) - logger.info('eval results saved to %s' % eval_file) - def _print_detection_metrics(self, coco_eval): IoU_lo_thresh = 0.5 IoU_hi_thresh = 0.95 diff --git a/example/rcnn/symimdb/imdb.py b/example/rcnn/symimdb/imdb.py new file mode 100644 index 000000000000..f78c6eae199e --- /dev/null +++ b/example/rcnn/symimdb/imdb.py @@ -0,0 +1,126 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
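+# Sketch of a concrete subclass (all names and values here are illustrative, +# assuming numpy imported as np): +# +#   class TinyDataset(IMDB): +#       classes = ['__background__', 'widget'] +#       def __init__(self, root_path): +#           super(TinyDataset, self).__init__('tiny', root_path) +#           self._roidb = self._get_cached('roidb', self._load_gt_roidb) +#       def _load_gt_roidb(self): +#           return [{'index': 0, 'image': 'img0.jpg', 'height': 480, 'width': 640, +#                    'boxes': np.array([[10, 10, 50, 50]], dtype=np.uint16), +#                    'gt_classes': np.array([1], dtype=np.int32), 'flipped': False}] +#       def _evaluate_detections(self, detections, **kwargs): +#           pass +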
+ +""" +Main functions of real IMDB includes: +_load_gt_roidb +_evaluate_detections + +General functions: +property: name, classes, num_classes, roidb, num_images +append_flipped_images +evaluate_detections + +roidb is a list of roi_rec +roi_rec is a dict of keys ["index", "image", "height", "width", "boxes", "gt_classes", "flipped"] +""" + +from symnet.logger import logger +import os +try: + import cPickle as pickle +except ImportError: + import pickle + + +class IMDB(object): + classes = [] + + def __init__(self, name, root_path): + """ + basic information about an image database + :param root_path: root path store cache and proposal data + """ + self._name = name + self._root_path = root_path + + # abstract attributes + self._classes = [] + self._roidb = [] + + # create cache + cache_folder = os.path.join(self._root_path, 'cache') + if not os.path.exists(cache_folder): + os.mkdir(cache_folder) + + @property + def name(self): + return self._name + + @property + def num_classes(self): + return len(self.classes) + + @property + def roidb(self): + return self._roidb + + @property + def num_images(self): + return len(self._roidb) + + def filter_roidb(self): + """Remove images without usable rois""" + num_roidb = len(self._roidb) + self._roidb = [roi_rec for roi_rec in self._roidb if len(roi_rec['gt_classes'])] + num_after = len(self._roidb) + logger.info('filter roidb: {} -> {}'.format(num_roidb, num_after)) + + def append_flipped_images(self): + """Only flip boxes coordinates, images will be flipped when loading into network""" + logger.info('%s append flipped images to roidb' % self._name) + roidb_flipped = [] + for roi_rec in self._roidb: + boxes = roi_rec['boxes'].copy() + oldx1 = boxes[:, 0].copy() + oldx2 = boxes[:, 2].copy() + boxes[:, 0] = roi_rec['width'] - oldx2 - 1 + boxes[:, 2] = roi_rec['width'] - oldx1 - 1 + assert (boxes[:, 2] >= boxes[:, 0]).all() + roi_rec_flipped = roi_rec.copy() + roi_rec_flipped['boxes'] = boxes + roi_rec_flipped['flipped'] = True + roidb_flipped.append(roi_rec_flipped) + self._roidb.extend(roidb_flipped) + + def evaluate_detections(self, detections, **kwargs): + cache_path = os.path.join(self._root_path, 'cache', '{}_{}.pkl'.format(self._name, 'detections')) + logger.info('saving cache {}'.format(cache_path)) + with open(cache_path, 'wb') as fid: + pickle.dump(detections, fid, pickle.HIGHEST_PROTOCOL) + self._evaluate_detections(detections, **kwargs) + + def _get_cached(self, cache_item, fn): + cache_path = os.path.join(self._root_path, 'cache', '{}_{}.pkl'.format(self._name, cache_item)) + if os.path.exists(cache_path): + logger.info('loading cache {}'.format(cache_path)) + with open(cache_path, 'rb') as fid: + cached = pickle.load(fid) + return cached + else: + logger.info('computing cache {}'.format(cache_path)) + cached = fn() + logger.info('saving cache {}'.format(cache_path)) + with open(cache_path, 'wb') as fid: + pickle.dump(cached, fid, pickle.HIGHEST_PROTOCOL) + return cached + + def _load_gt_roidb(self): + raise NotImplementedError + + def _evaluate_detections(self, detections, **kwargs): + raise NotImplementedError diff --git a/example/rcnn/symimdb/pascal_voc.py b/example/rcnn/symimdb/pascal_voc.py new file mode 100644 index 000000000000..cebc59132918 --- /dev/null +++ b/example/rcnn/symimdb/pascal_voc.py @@ -0,0 +1,269 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import os +import numpy as np + +from symnet.logger import logger +from .imdb import IMDB + + +class PascalVOC(IMDB): + classes = ['__background__', # always index 0 + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor'] + + def __init__(self, image_set, root_path, devkit_path): + """ + fill basic information to initialize imdb + :param image_set: 2007_trainval, 2007_test, etc + :param root_path: 'data', will write 'cache' + :param devkit_path: 'data/VOCdevkit', load data and write results + """ + super(PascalVOC, self).__init__('voc_' + image_set, root_path) + + year, image_set = image_set.split('_') + self._config = {'comp_id': 'comp4', + 'use_diff': False, + 'min_size': 2} + self._class_to_ind = dict(zip(self.classes, range(self.num_classes))) + self._image_index_file = os.path.join(devkit_path, 'VOC' + year, 'ImageSets', 'Main', image_set + '.txt') + self._image_file_tmpl = os.path.join(devkit_path, 'VOC' + year, 'JPEGImages', '{}.jpg') + self._image_anno_tmpl = os.path.join(devkit_path, 'VOC' + year, 'Annotations', '{}.xml') + + # results + result_folder = os.path.join(devkit_path, 'results', 'VOC' + year, 'Main') + if not os.path.exists(result_folder): + os.makedirs(result_folder) + self._result_file_tmpl = os.path.join(result_folder, 'comp4_det_' + image_set + '_{}.txt') + + # get roidb + self._roidb = self._get_cached('roidb', self._load_gt_roidb) + logger.info('%s num_images %d' % (self.name, self.num_images)) + + def _load_gt_roidb(self): + image_index = self._load_image_index() + gt_roidb = [self._load_annotation(index) for index in image_index] + return gt_roidb + + def _load_image_index(self): + with open(self._image_index_file) as f: + image_set_index = [x.strip() for x in f.readlines()] + return image_set_index + + def _load_annotation(self, index): + # store original annotation as orig_objs + height, width, orig_objs = self._parse_voc_anno(self._image_anno_tmpl.format(index)) + + # filter difficult objects + if not self._config['use_diff']: + non_diff_objs = [obj for obj in orig_objs if obj['difficult'] == 0] + objs = non_diff_objs + else: + objs = orig_objs + num_objs = len(objs) + + boxes = np.zeros((num_objs, 4), dtype=np.uint16) + gt_classes = np.zeros((num_objs,), dtype=np.int32) + # Load object bounding boxes into a data frame. 
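+        # VOC annotation coordinates are 1-based, hence the -1 conversion below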
+ for ix, obj in enumerate(objs): + # Make pixel indexes 0-based + x1 = obj['bbox'][0] - 1 + y1 = obj['bbox'][1] - 1 + x2 = obj['bbox'][2] - 1 + y2 = obj['bbox'][3] - 1 + cls = self._class_to_ind[obj['name'].lower().strip()] + boxes[ix, :] = [x1, y1, x2, y2] + gt_classes[ix] = cls + + roi_rec = {'index': index, + 'objs': orig_objs, + 'image': self._image_file_tmpl.format(index), + 'height': height, + 'width': width, + 'boxes': boxes, + 'gt_classes': gt_classes, + 'flipped': False} + return roi_rec + + @staticmethod + def _parse_voc_anno(filename): + import xml.etree.ElementTree as ET + tree = ET.parse(filename) + height = int(tree.find('size').find('height').text) + width = int(tree.find('size').find('width').text) + objects = [] + for obj in tree.findall('object'): + obj_dict = dict() + obj_dict['name'] = obj.find('name').text + obj_dict['difficult'] = int(obj.find('difficult').text) + bbox = obj.find('bndbox') + obj_dict['bbox'] = [int(float(bbox.find('xmin').text)), + int(float(bbox.find('ymin').text)), + int(float(bbox.find('xmax').text)), + int(float(bbox.find('ymax').text))] + objects.append(obj_dict) + return height, width, objects + + def _evaluate_detections(self, detections, use_07_metric=True, **kargs): + self._write_pascal_results(detections) + self._do_python_eval(detections, use_07_metric) + + def _write_pascal_results(self, all_boxes): + for cls_ind, cls in enumerate(self.classes): + if cls == '__background__': + continue + logger.info('Writing %s VOC results file' % cls) + filename = self._result_file_tmpl.format(cls) + with open(filename, 'wt') as f: + for im_ind, roi_rec in enumerate(self.roidb): + index = roi_rec['index'] + dets = all_boxes[cls_ind][im_ind] + if len(dets) == 0: + continue + # the VOCdevkit expects 1-based indices + for k in range(dets.shape[0]): + f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
+                            format(index, dets[k, -1], +                                   dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1)) + +    def _do_python_eval(self, all_boxes, use_07_metric): +        aps = [] +        for cls_ind, cls in enumerate(self.classes): +            if cls == '__background__': +                continue +            # class_anno maps image_index to {'bbox', 'difficult', 'det'} +            class_anno = {} +            npos = 0 +            for roi_rec in self.roidb: +                index = roi_rec['index'] +                objects = [obj for obj in roi_rec['objs'] if obj['name'] == cls] +                bbox = np.array([x['bbox'] for x in objects]) +                difficult = np.array([x['difficult'] for x in objects]).astype(np.bool) +                det = [False] * len(objects)  # stands for 'already detected' +                npos = npos + sum(~difficult) +                class_anno[index] = {'bbox': bbox, +                                     'difficult': difficult, +                                     'det': det} + +            # bbox is 2d array of all detections, corresponding to each image_id +            image_ids = [] +            bbox = [] +            confidence = [] +            for im_ind, dets in enumerate(all_boxes[cls_ind]): +                for k in range(dets.shape[0]): +                    image_ids.append(self.roidb[im_ind]['index']) +                    bbox.append([dets[k, 0] + 1, dets[k, 1] + 1, dets[k, 2] + 1, dets[k, 3] + 1]) +                    confidence.append(dets[k, -1]) +            bbox = np.array(bbox) +            confidence = np.array(confidence) + +            rec, prec, ap = self.voc_eval(class_anno, npos, image_ids, bbox, confidence, +                                          ovthresh=0.5, use_07_metric=use_07_metric) +            aps.append(ap) + +        for cls, ap in zip(self.classes, aps): +            logger.info('AP for {} = {:.4f}'.format(cls, ap)) +        logger.info('Mean AP = {:.4f}'.format(np.mean(aps))) + +    @staticmethod +    def voc_eval(class_anno, npos, image_ids, bbox, confidence, ovthresh=0.5, use_07_metric=False): +        # sort by confidence +        if bbox.shape[0] > 0: +            sorted_inds = np.argsort(-confidence) +            sorted_scores = np.sort(-confidence) +            bbox = bbox[sorted_inds, :] +            image_ids = [image_ids[x] for x in sorted_inds] + +        # go down detections and mark true positives and false positives +        nd = len(image_ids) +        tp = np.zeros(nd) +        fp = np.zeros(nd) +        for d in range(nd): +            r = class_anno[image_ids[d]] +            bb = bbox[d, :].astype(float) +            ovmax = -np.inf +            bbgt = r['bbox'].astype(float) + +            if bbgt.size > 0: +                # compute overlaps +                # intersection +                ixmin = np.maximum(bbgt[:, 0], bb[0]) +                iymin = np.maximum(bbgt[:, 1], bb[1]) +                ixmax = np.minimum(bbgt[:, 2], bb[2]) +                iymax = np.minimum(bbgt[:, 3], bb[3]) +                iw = np.maximum(ixmax - ixmin + 1., 0.) +                ih = np.maximum(iymax - iymin + 1., 0.) +                inters = iw * ih + +                # union +                uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + +                       (bbgt[:, 2] - bbgt[:, 0] + 1.) * +                       (bbgt[:, 3] - bbgt[:, 1] + 1.) - inters) + +                overlaps = inters / uni +                ovmax = np.max(overlaps) +                jmax = np.argmax(overlaps) + +            if ovmax > ovthresh: +                if not r['difficult'][jmax]: +                    if not r['det'][jmax]: +                        tp[d] = 1. +                        r['det'][jmax] = 1 +                    else: +                        fp[d] = 1. +            else: +                fp[d] = 1. + +        # compute precision recall +        fp = np.cumsum(fp) +        tp = np.cumsum(tp) +        rec = tp / float(npos) +        # avoid division by zero in case the first detection matches a difficult ground truth +        prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) +        ap = PascalVOC.voc_ap(rec, prec, use_07_metric) + +        return rec, prec, ap + +    @staticmethod +    def voc_ap(rec, prec, use_07_metric=False): +        if use_07_metric: +            ap = 0. +            for t in np.arange(0., 1.1, 0.1): +                if np.sum(rec >= t) == 0: +                    p = 0 +                else: +                    p = np.max(prec[rec >= t]) +                ap += p / 11.
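+            # (the 11-point average above is the legacy VOC07 metric; the else branch +            # below integrates the area under the exact precision envelope instead)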
+ else: + # append sentinel values at both ends + mrec = np.concatenate(([0.], rec, [1.])) + mpre = np.concatenate(([0.], prec, [0.])) + + # compute precision integration ladder + for i in range(mpre.size - 1, 0, -1): + mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) + + # look for recall value changes + i = np.where(mrec[1:] != mrec[:-1])[0] + + # sum (\delta recall) * prec + ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) + return ap diff --git a/example/rcnn/rcnn/cython/__init__.py b/example/rcnn/symnet/__init__.py similarity index 100% rename from example/rcnn/rcnn/cython/__init__.py rename to example/rcnn/symnet/__init__.py diff --git a/example/rcnn/rcnn/logger.py b/example/rcnn/symnet/logger.py similarity index 100% rename from example/rcnn/rcnn/logger.py rename to example/rcnn/symnet/logger.py diff --git a/example/rcnn/rcnn/core/metric.py b/example/rcnn/symnet/metric.py similarity index 78% rename from example/rcnn/rcnn/core/metric.py rename to example/rcnn/symnet/metric.py index d33edb65beda..fa8d7919e919 100644 --- a/example/rcnn/rcnn/core/metric.py +++ b/example/rcnn/symnet/metric.py @@ -18,30 +18,17 @@ import mxnet as mx import numpy as np -from rcnn.config import config - -def get_rpn_names(): - pred = ['rpn_cls_prob', 'rpn_bbox_loss'] +def get_names(): + pred = ['rpn_cls_prob', 'rpn_bbox_loss', 'rcnn_cls_prob', 'rcnn_bbox_loss', 'rcnn_label'] label = ['rpn_label', 'rpn_bbox_target', 'rpn_bbox_weight'] return pred, label -def get_rcnn_names(): - pred = ['rcnn_cls_prob', 'rcnn_bbox_loss'] - label = ['rcnn_label', 'rcnn_bbox_target', 'rcnn_bbox_weight'] - if config.TRAIN.END2END: - pred.append('rcnn_label') - rpn_pred, rpn_label = get_rpn_names() - pred = rpn_pred + pred - label = rpn_label - return pred, label - - class RPNAccMetric(mx.metric.EvalMetric): def __init__(self): super(RPNAccMetric, self).__init__('RPNAcc') - self.pred, self.label = get_rpn_names() + self.pred, self.label = get_names() def update(self, labels, preds): pred = preds[self.pred.index('rpn_cls_prob')] @@ -65,15 +52,11 @@ def update(self, labels, preds): class RCNNAccMetric(mx.metric.EvalMetric): def __init__(self): super(RCNNAccMetric, self).__init__('RCNNAcc') - self.e2e = config.TRAIN.END2END - self.pred, self.label = get_rcnn_names() + self.pred, self.label = get_names() def update(self, labels, preds): pred = preds[self.pred.index('rcnn_cls_prob')] - if self.e2e: - label = preds[self.pred.index('rcnn_label')] - else: - label = labels[self.label.index('rcnn_label')] + label = preds[self.pred.index('rcnn_label')] last_dim = pred.shape[-1] pred_label = pred.asnumpy().reshape(-1, last_dim).argmax(axis=1).astype('int32') @@ -86,7 +69,7 @@ def update(self, labels, preds): class RPNLogLossMetric(mx.metric.EvalMetric): def __init__(self): super(RPNLogLossMetric, self).__init__('RPNLogLoss') - self.pred, self.label = get_rpn_names() + self.pred, self.label = get_names() def update(self, labels, preds): pred = preds[self.pred.index('rpn_cls_prob')] @@ -113,15 +96,11 @@ def update(self, labels, preds): class RCNNLogLossMetric(mx.metric.EvalMetric): def __init__(self): super(RCNNLogLossMetric, self).__init__('RCNNLogLoss') - self.e2e = config.TRAIN.END2END - self.pred, self.label = get_rcnn_names() + self.pred, self.label = get_names() def update(self, labels, preds): pred = preds[self.pred.index('rcnn_cls_prob')] - if self.e2e: - label = preds[self.pred.index('rcnn_label')] - else: - label = labels[self.label.index('rcnn_label')] + label = preds[self.pred.index('rcnn_label')] last_dim = pred.shape[-1] pred = 
pred.asnumpy().reshape(-1, last_dim) @@ -138,7 +117,7 @@ def update(self, labels, preds): class RPNL1LossMetric(mx.metric.EvalMetric): def __init__(self): super(RPNL1LossMetric, self).__init__('RPNL1Loss') - self.pred, self.label = get_rpn_names() + self.pred, self.label = get_names() def update(self, labels, preds): bbox_loss = preds[self.pred.index('rpn_bbox_loss')].asnumpy() @@ -154,15 +133,11 @@ def update(self, labels, preds): class RCNNL1LossMetric(mx.metric.EvalMetric): def __init__(self): super(RCNNL1LossMetric, self).__init__('RCNNL1Loss') - self.e2e = config.TRAIN.END2END - self.pred, self.label = get_rcnn_names() + self.pred, self.label = get_names() def update(self, labels, preds): bbox_loss = preds[self.pred.index('rcnn_bbox_loss')].asnumpy() - if self.e2e: - label = preds[self.pred.index('rcnn_label')].asnumpy() - else: - label = labels[self.label.index('rcnn_label')].asnumpy() + label = preds[self.pred.index('rcnn_label')].asnumpy() # calculate num_inst keep_inds = np.where(label != 0)[0] diff --git a/example/rcnn/symnet/model.py b/example/rcnn/symnet/model.py new file mode 100644 index 000000000000..f615d58e4107 --- /dev/null +++ b/example/rcnn/symnet/model.py @@ -0,0 +1,88 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
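+
+# Typical usage (a sketch): load a checkpoint onto a device, then validate the
+# parameters against a symbol before binding, e.g.
+#   arg_params, aux_params = load_param('model/vgg16-0000.params', ctx=mx.gpu(0))
+#   check_shape(sym, data_shapes, arg_params, aux_params)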
+
+import mxnet as mx
+
+
+def load_param(params, ctx=None):
+    """same as mx.model.load_checkpoint, but does not load the symbol and converts the parameters to the given context"""
+    if ctx is None:
+        ctx = mx.cpu()
+    save_dict = mx.nd.load(params)
+    arg_params = {}
+    aux_params = {}
+    for k, v in save_dict.items():
+        tp, name = k.split(':', 1)
+        if tp == 'arg':
+            arg_params[name] = v.as_in_context(ctx)
+        if tp == 'aux':
+            aux_params[name] = v.as_in_context(ctx)
+    return arg_params, aux_params
+
+
+def infer_param_shape(symbol, data_shapes):
+    arg_shape, _, aux_shape = symbol.infer_shape(**dict(data_shapes))
+    arg_shape_dict = dict(zip(symbol.list_arguments(), arg_shape))
+    aux_shape_dict = dict(zip(symbol.list_auxiliary_states(), aux_shape))
+    return arg_shape_dict, aux_shape_dict
+
+
+def infer_data_shape(symbol, data_shapes):
+    _, out_shape, _ = symbol.infer_shape(**dict(data_shapes))
+    data_shape_dict = dict(data_shapes)
+    out_shape_dict = dict(zip(symbol.list_outputs(), out_shape))
+    return data_shape_dict, out_shape_dict
+
+
+def check_shape(symbol, data_shapes, arg_params, aux_params):
+    arg_shape_dict, aux_shape_dict = infer_param_shape(symbol, data_shapes)
+    data_shape_dict, out_shape_dict = infer_data_shape(symbol, data_shapes)
+    for k in symbol.list_arguments():
+        if k in data_shape_dict or 'label' in k:
+            continue
+        assert k in arg_params, '%s not initialized' % k
+        assert arg_params[k].shape == arg_shape_dict[k], \
+            'shape inconsistent for %s inferred %s provided %s' % (k, arg_shape_dict[k], arg_params[k].shape)
+    for k in symbol.list_auxiliary_states():
+        assert k in aux_params, '%s not initialized' % k
+        assert aux_params[k].shape == aux_shape_dict[k], \
+            'shape inconsistent for %s inferred %s provided %s' % (k, aux_shape_dict[k], aux_params[k].shape)
+
+
+def initialize_frcnn(symbol, data_shapes, arg_params, aux_params):
+    arg_shape_dict, aux_shape_dict = infer_param_shape(symbol, data_shapes)
+    arg_params['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight'])
+    arg_params['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias'])
+    arg_params['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight'])
+    arg_params['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias'])
+    arg_params['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_bbox_pred_weight'])
+    arg_params['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
+    arg_params['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight'])
+    arg_params['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
+    arg_params['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight'])
+    arg_params['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
+    return arg_params, aux_params
+
+
+def get_fixed_params(symbol, fixed_param_prefix=''):
+    fixed_param_names = []
+    if fixed_param_prefix:
+        for name in symbol.list_arguments():
+            for prefix in fixed_param_prefix:
+                if prefix in name:
+                    fixed_param_names.append(name)
+    return fixed_param_names
diff --git a/example/rcnn/symnet/proposal_target.py b/example/rcnn/symnet/proposal_target.py
new file mode 100644
index 000000000000..926720b28b5f
--- /dev/null
+++ b/example/rcnn/symnet/proposal_target.py
@@ -0,0 +1,175 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Proposal Target Operator selects foreground and background RoIs and assigns labels and bbox-regression targets to them.
+"""
+
+import mxnet as mx
+import numpy as np
+
+from symdata.bbox import bbox_overlaps, bbox_transform
+
+
+def sample_rois(rois, gt_boxes, num_classes, rois_per_image, fg_rois_per_image, fg_overlap, box_stds):
+    """
+    generate random sample of ROIs comprising foreground and background examples
+    :param rois: [n, 5] (batch_index, x1, y1, x2, y2)
+    :param gt_boxes: [n, 5] (x1, y1, x2, y2, cls)
+    :param num_classes: number of classes
+    :param rois_per_image: total roi number
+    :param fg_rois_per_image: foreground roi number
+    :param fg_overlap: overlap threshold for fg rois
+    :param box_stds: standard deviations of bbox regression targets
+    :return: (rois, labels, bbox_targets, bbox_weights)
+    """
+    overlaps = bbox_overlaps(rois[:, 1:], gt_boxes[:, :4])
+    gt_assignment = overlaps.argmax(axis=1)
+    labels = gt_boxes[gt_assignment, 4]
+    max_overlaps = overlaps.max(axis=1)
+
+    # select foreground RoI with FG_THRESH overlap
+    fg_indexes = np.where(max_overlaps >= fg_overlap)[0]
+    # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs
+    fg_rois_this_image = min(fg_rois_per_image, len(fg_indexes))
+    # sample foreground regions without replacement
+    if len(fg_indexes) > fg_rois_this_image:
+        fg_indexes = np.random.choice(fg_indexes, size=fg_rois_this_image, replace=False)
+
+    # select background RoIs as those within [0, FG_THRESH)
+    bg_indexes = np.where(max_overlaps < fg_overlap)[0]
+    # compute number of background RoIs to take from this image (guarding against there being fewer than desired)
+    bg_rois_this_image = rois_per_image - fg_rois_this_image
+    bg_rois_this_image = min(bg_rois_this_image, len(bg_indexes))
+    # sample bg rois without replacement
+    if len(bg_indexes) > bg_rois_this_image:
+        bg_indexes = np.random.choice(bg_indexes, size=bg_rois_this_image, replace=False)
+
+    # indexes selected
+    keep_indexes = np.append(fg_indexes, bg_indexes)
+    # pad more bg rois to ensure a fixed minibatch size
+    while len(keep_indexes) < rois_per_image:
+        gap = min(len(bg_indexes), rois_per_image - len(keep_indexes))
+        gap_indexes = np.random.choice(range(len(bg_indexes)), size=gap, replace=False)
+        keep_indexes = np.append(keep_indexes, bg_indexes[gap_indexes])
+
+    # sample rois and labels
+    rois = rois[keep_indexes]
+    labels = labels[keep_indexes]
+    # set labels of bg rois to be 0
+    labels[fg_rois_this_image:] = 0
+
+    # load or compute bbox_target
+    targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4], box_stds=box_stds)
+    bbox_targets = np.zeros((rois_per_image, 4 * num_classes), dtype=np.float32)
+    bbox_weights = np.zeros((rois_per_image, 4 * num_classes), dtype=np.float32)
+    for i in range(fg_rois_this_image):
+        cls_ind = int(labels[i])
+        bbox_targets[i, cls_ind * 4:(cls_ind + 1) * 4] = targets[i]
+        bbox_weights[i, cls_ind * 4:(cls_ind + 1) * 4] = 1
+
+    return rois, labels, bbox_targets, bbox_weights
+
+
+class ProposalTargetOperator(mx.operator.CustomOp):
+    def __init__(self, num_classes, batch_images, batch_rois, fg_fraction, fg_overlap, box_stds):
+        super(ProposalTargetOperator, self).__init__()
+        self._num_classes = num_classes
+        self._batch_images = batch_images
+        self._batch_rois = batch_rois
+        self._rois_per_image = int(batch_rois / batch_images)
+        self._fg_rois_per_image = int(round(fg_fraction * self._rois_per_image))
+        self._fg_overlap = fg_overlap
+        self._box_stds = box_stds
+
+    def forward(self, is_train, req, in_data, out_data, aux):
+        assert self._batch_images == in_data[1].shape[0], 'check batch size of gt_boxes'
+
+        all_rois = in_data[0].asnumpy()
+        all_gt_boxes = in_data[1].asnumpy()
+
+        rois = np.empty((0, 5), dtype=np.float32)
+        labels = np.empty((0, ), dtype=np.float32)
+        bbox_targets = np.empty((0, 4 * self._num_classes), dtype=np.float32)
+        bbox_weights = np.empty((0, 4 * self._num_classes), dtype=np.float32)
+        for batch_idx in range(self._batch_images):
+            b_rois = all_rois[np.where(all_rois[:, 0] == batch_idx)[0]]
+            b_gt_boxes = all_gt_boxes[batch_idx]
+            b_gt_boxes = b_gt_boxes[np.where(b_gt_boxes[:, -1] > 0)[0]]
+
+            # Include ground-truth boxes in the set of candidate rois
+            batch_pad = batch_idx * np.ones((b_gt_boxes.shape[0], 1), dtype=b_gt_boxes.dtype)
+            b_rois = np.vstack((b_rois, np.hstack((batch_pad, b_gt_boxes[:, :-1]))))
+
+            b_rois, b_labels, b_bbox_targets, b_bbox_weights = \
+                sample_rois(b_rois, b_gt_boxes, num_classes=self._num_classes, rois_per_image=self._rois_per_image,
+                            fg_rois_per_image=self._fg_rois_per_image, fg_overlap=self._fg_overlap, box_stds=self._box_stds)
+
+            rois = np.vstack((rois, b_rois))
+            labels = np.hstack((labels, b_labels))
+            bbox_targets = np.vstack((bbox_targets, b_bbox_targets))
+            bbox_weights = np.vstack((bbox_weights, b_bbox_weights))
+
+        self.assign(out_data[0], req[0], rois)
+        self.assign(out_data[1], req[1], labels)
+        self.assign(out_data[2], req[2], bbox_targets)
+        self.assign(out_data[3], req[3], bbox_weights)
+
+    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+        self.assign(in_grad[0], req[0], 0)
+        self.assign(in_grad[1], req[1], 0)
+
+
+@mx.operator.register('proposal_target')
+class ProposalTargetProp(mx.operator.CustomOpProp):
+    def __init__(self, num_classes='21', batch_images='1', batch_rois='128', fg_fraction='0.25',
+                 fg_overlap='0.5', box_stds='(0.1, 0.1, 0.2, 0.2)'):
+        super(ProposalTargetProp, self).__init__(need_top_grad=False)
+        self._num_classes = int(num_classes)
+        self._batch_images = int(batch_images)
+        self._batch_rois = int(batch_rois)
+        self._fg_fraction = float(fg_fraction)
+        self._fg_overlap = float(fg_overlap)
+        self._box_stds = tuple(np.fromstring(box_stds[1:-1], dtype=float, sep=','))
+
+    def list_arguments(self):
+        return ['rois', 'gt_boxes']
+
+    def list_outputs(self):
+        return ['rois_output', 'label', 'bbox_target', 'bbox_weight']
+
+    def infer_shape(self, in_shape):
+        assert self._batch_rois % self._batch_images == 0, \
+            'BATCH_IMAGES {} must divide BATCH_ROIS {}'.format(self._batch_images, self._batch_rois)
+
+        rpn_rois_shape = in_shape[0]
+        gt_boxes_shape = in_shape[1]
+
+        output_rois_shape = (self._batch_rois, 5)
+        label_shape = (self._batch_rois, )
+        bbox_target_shape = (self._batch_rois, self._num_classes * 4)
+        bbox_weight_shape = (self._batch_rois, self._num_classes * 4)
+
+        return [rpn_rois_shape, gt_boxes_shape], \
+               [output_rois_shape,
label_shape, bbox_target_shape, bbox_weight_shape] + + def create_operator(self, ctx, shapes, dtypes): + return ProposalTargetOperator(self._num_classes, self._batch_images, self._batch_rois, self._fg_fraction, + self._fg_overlap, self._box_stds) + + def declare_backward_dependency(self, out_grad, in_data, out_data): + return [] diff --git a/example/rcnn/rcnn/symbol/symbol_resnet.py b/example/rcnn/symnet/symbol_resnet.py similarity index 59% rename from example/rcnn/rcnn/symbol/symbol_resnet.py rename to example/rcnn/symnet/symbol_resnet.py index f7721366c17c..16b05533e232 100644 --- a/example/rcnn/rcnn/symbol/symbol_resnet.py +++ b/example/rcnn/symnet/symbol_resnet.py @@ -16,16 +16,11 @@ # under the License. import mxnet as mx -from rcnn.config import config -from . import proposal from . import proposal_target -eps = 2e-5 -use_global_stats = True -workspace = 512 -res_deps = {'50': (3, 4, 6, 3), '101': (3, 4, 23, 3), '152': (3, 8, 36, 3), '200': (3, 24, 36, 3)} -units = res_deps['101'] -filter_list = [256, 512, 1024, 2048] +eps=2e-5 +use_global_stats=True +workspace=1024 def residual_unit(data, num_filter, stride, dim_match, name): @@ -50,7 +45,7 @@ def residual_unit(data, num_filter, stride, dim_match, name): return sum -def get_resnet_conv(data): +def get_resnet_feature(data, units, filter_list): # res1 data_bn = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=eps, use_global_stats=use_global_stats, name='bn_data') conv0 = mx.sym.Convolution(data=data_bn, num_filter=64, kernel=(7, 7), stride=(2, 2), pad=(3, 3), @@ -76,7 +71,23 @@ def get_resnet_conv(data): return unit -def get_resnet_train(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS): +def get_resnet_top_feature(data, units, filter_list): + unit = residual_unit(data=data, num_filter=filter_list[3], stride=(2, 2), dim_match=False, name='stage4_unit1') + for i in range(2, units[3] + 1): + unit = residual_unit(data=unit, num_filter=filter_list[3], stride=(1, 1), dim_match=True, name='stage4_unit%s' % i) + bn1 = mx.sym.BatchNorm(data=unit, fix_gamma=False, eps=eps, use_global_stats=use_global_stats, name='bn1') + relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') + pool1 = mx.symbol.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') + return pool1 + + +def get_resnet_train(anchor_scales, anchor_ratios, rpn_feature_stride, + rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size, rpn_batch_rois, + num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size, + rcnn_batch_rois, rcnn_fg_fraction, rcnn_fg_overlap, rcnn_bbox_stds, + units, filter_list): + num_anchors = len(anchor_scales) * len(anchor_ratios) + data = mx.symbol.Variable(name="data") im_info = mx.symbol.Variable(name="im_info") gt_boxes = mx.symbol.Variable(name="gt_boxes") @@ -85,144 +96,129 @@ def get_resnet_train(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCH rpn_bbox_weight = mx.symbol.Variable(name='bbox_weight') # shared convolutional layers - conv_feat = get_resnet_conv(data) + conv_feat = get_resnet_feature(data, units=units, filter_list=filter_list) - # RPN layers + # rpn feature rpn_conv = mx.symbol.Convolution( data=conv_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + + # rpn classification rpn_cls_score = mx.symbol.Convolution( data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") - rpn_bbox_pred = mx.symbol.Convolution( - data=rpn_relu, 
kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - - # prepare rpn data rpn_cls_score_reshape = mx.symbol.Reshape( data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - - # classification rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=rpn_label, multi_output=True, normalization='valid', use_ignore=True, ignore_label=-1, name="rpn_cls_prob") - # bounding box regression - rpn_bbox_loss_ = rpn_bbox_weight * mx.symbol.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, data=(rpn_bbox_pred - rpn_bbox_target)) - rpn_bbox_loss = mx.sym.MakeLoss(name='rpn_bbox_loss', data=rpn_bbox_loss_, grad_scale=1.0 / config.TRAIN.RPN_BATCH_SIZE) - - # ROI proposal - rpn_cls_act = mx.symbol.SoftmaxActivation( - data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_act") + rpn_cls_act = mx.symbol.softmax( + data=rpn_cls_score_reshape, axis=1, name="rpn_cls_act") rpn_cls_act_reshape = mx.symbol.Reshape( data=rpn_cls_act, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_act_reshape') - if config.TRAIN.CXX_PROPOSAL: - rois = mx.symbol.contrib.Proposal( - cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - feature_stride=config.RPN_FEAT_STRIDE, scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TRAIN.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TRAIN.RPN_POST_NMS_TOP_N, - threshold=config.TRAIN.RPN_NMS_THRESH, rpn_min_size=config.TRAIN.RPN_MIN_SIZE) - else: - rois = mx.symbol.Custom( - cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - op_type='proposal', feat_stride=config.RPN_FEAT_STRIDE, - scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TRAIN.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TRAIN.RPN_POST_NMS_TOP_N, - threshold=config.TRAIN.RPN_NMS_THRESH, rpn_min_size=config.TRAIN.RPN_MIN_SIZE) - - # ROI proposal target - gt_boxes_reshape = mx.symbol.Reshape(data=gt_boxes, shape=(-1, 5), name='gt_boxes_reshape') - group = mx.symbol.Custom(rois=rois, gt_boxes=gt_boxes_reshape, op_type='proposal_target', - num_classes=num_classes, batch_images=config.TRAIN.BATCH_IMAGES, - batch_rois=config.TRAIN.BATCH_ROIS, fg_fraction=config.TRAIN.FG_FRACTION) + + # rpn bbox regression + rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + rpn_bbox_loss_ = rpn_bbox_weight * mx.symbol.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, data=(rpn_bbox_pred - rpn_bbox_target)) + rpn_bbox_loss = mx.sym.MakeLoss(name='rpn_bbox_loss', data=rpn_bbox_loss_, grad_scale=1.0 / rpn_batch_rois) + + # rpn proposal + rois = mx.symbol.contrib.MultiProposal( + cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios, + rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk, + threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size) + + # rcnn roi proposal target + group = mx.symbol.Custom(rois=rois, gt_boxes=gt_boxes, op_type='proposal_target', + num_classes=num_classes, batch_images=rcnn_batch_size, + batch_rois=rcnn_batch_rois, fg_fraction=rcnn_fg_fraction, + fg_overlap=rcnn_fg_overlap, box_stds=rcnn_bbox_stds) rois = group[0] label = group[1] bbox_target = group[2] bbox_weight = group[3] - # Fast R-CNN + # rcnn roi pool roi_pool = mx.symbol.ROIPooling( - name='roi_pool5', data=conv_feat, rois=rois, pooled_size=(14, 14), 
spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) + name='roi_pool', data=conv_feat, rois=rois, pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride) - # res5 - unit = residual_unit(data=roi_pool, num_filter=filter_list[3], stride=(2, 2), dim_match=False, name='stage4_unit1') - for i in range(2, units[3] + 1): - unit = residual_unit(data=unit, num_filter=filter_list[3], stride=(1, 1), dim_match=True, name='stage4_unit%s' % i) - bn1 = mx.sym.BatchNorm(data=unit, fix_gamma=False, eps=eps, use_global_stats=use_global_stats, name='bn1') - relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - pool1 = mx.symbol.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') + # rcnn top feature + top_feat = get_resnet_top_feature(roi_pool, units=units, filter_list=filter_list) - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=pool1, num_hidden=num_classes) + # rcnn classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=top_feat, num_hidden=num_classes) cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label, normalization='batch') - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=pool1, num_hidden=num_classes * 4) + + # rcnn bbox regression + bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=top_feat, num_hidden=num_classes * 4) bbox_loss_ = bbox_weight * mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, data=(bbox_pred - bbox_target)) - bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / config.TRAIN.BATCH_ROIS) + bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / rcnn_batch_rois) # reshape output - label = mx.symbol.Reshape(data=label, shape=(config.TRAIN.BATCH_IMAGES, -1), name='label_reshape') - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TRAIN.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(config.TRAIN.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_loss_reshape') + label = mx.symbol.Reshape(data=label, shape=(rcnn_batch_size, -1), name='label_reshape') + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape') + bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(rcnn_batch_size, -1, 4 * num_classes), name='bbox_loss_reshape') + # group output group = mx.symbol.Group([rpn_cls_prob, rpn_bbox_loss, cls_prob, bbox_loss, mx.symbol.BlockGrad(label)]) return group -def get_resnet_test(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS): +def get_resnet_test(anchor_scales, anchor_ratios, rpn_feature_stride, + rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size, + num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size, + units, filter_list): + num_anchors = len(anchor_scales) * len(anchor_ratios) + data = mx.symbol.Variable(name="data") im_info = mx.symbol.Variable(name="im_info") # shared convolutional layers - conv_feat = get_resnet_conv(data) + conv_feat = get_resnet_feature(data, units=units, filter_list=filter_list) - # RPN + # rpn feature rpn_conv = mx.symbol.Convolution( data=conv_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + + # rpn classification rpn_cls_score = mx.symbol.Convolution( data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + 
rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + rpn_cls_act = mx.symbol.softmax( + data=rpn_cls_score_reshape, axis=1, name="rpn_cls_act") + rpn_cls_act_reshape = mx.symbol.Reshape( + data=rpn_cls_act, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_act_reshape') + + # rpn bbox regression rpn_bbox_pred = mx.symbol.Convolution( data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") - # ROI Proposal - rpn_cls_score_reshape = mx.symbol.Reshape( - data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") - rpn_cls_prob = mx.symbol.SoftmaxActivation( - data=rpn_cls_score_reshape, mode="channel", name="rpn_cls_prob") - rpn_cls_prob_reshape = mx.symbol.Reshape( - data=rpn_cls_prob, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_prob_reshape') - if config.TEST.CXX_PROPOSAL: - rois = mx.symbol.contrib.Proposal( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - feature_stride=config.RPN_FEAT_STRIDE, scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.RPN_POST_NMS_TOP_N, - threshold=config.TEST.RPN_NMS_THRESH, rpn_min_size=config.TEST.RPN_MIN_SIZE) - else: - rois = mx.symbol.Custom( - cls_prob=rpn_cls_prob_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', - op_type='proposal', feat_stride=config.RPN_FEAT_STRIDE, - scales=tuple(config.ANCHOR_SCALES), ratios=tuple(config.ANCHOR_RATIOS), - rpn_pre_nms_top_n=config.TEST.RPN_PRE_NMS_TOP_N, rpn_post_nms_top_n=config.TEST.RPN_POST_NMS_TOP_N, - threshold=config.TEST.RPN_NMS_THRESH, rpn_min_size=config.TEST.RPN_MIN_SIZE) - - # Fast R-CNN + # rpn proposal + rois = mx.symbol.contrib.MultiProposal( + cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios, + rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk, + threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size) + + # rcnn roi pool roi_pool = mx.symbol.ROIPooling( - name='roi_pool5', data=conv_feat, rois=rois, pooled_size=(14, 14), spatial_scale=1.0 / config.RCNN_FEAT_STRIDE) + name='roi_pool', data=conv_feat, rois=rois, pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride) - # res5 - unit = residual_unit(data=roi_pool, num_filter=filter_list[3], stride=(2, 2), dim_match=False, name='stage4_unit1') - for i in range(2, units[3] + 1): - unit = residual_unit(data=unit, num_filter=filter_list[3], stride=(1, 1), dim_match=True, name='stage4_unit%s' % i) - bn1 = mx.sym.BatchNorm(data=unit, fix_gamma=False, eps=eps, use_global_stats=use_global_stats, name='bn1') - relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - pool1 = mx.symbol.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') + # rcnn top feature + top_feat = get_resnet_top_feature(roi_pool, units=units, filter_list=filter_list) - # classification - cls_score = mx.symbol.FullyConnected(name='cls_score', data=pool1, num_hidden=num_classes) + # rcnn classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=top_feat, num_hidden=num_classes) cls_prob = mx.symbol.softmax(name='cls_prob', data=cls_score) - # bounding box regression - bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=pool1, num_hidden=num_classes * 4) + + # rcnn bbox regression + bbox_pred = 
mx.symbol.FullyConnected(name='bbox_pred', data=top_feat, num_hidden=num_classes * 4) # reshape output - cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(config.TEST.BATCH_IMAGES, -1, num_classes), name='cls_prob_reshape') - bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(config.TEST.BATCH_IMAGES, -1, 4 * num_classes), name='bbox_pred_reshape') + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape') + bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(rcnn_batch_size, -1, 4 * num_classes), name='bbox_pred_reshape') # group output group = mx.symbol.Group([rois, cls_prob, bbox_pred]) diff --git a/example/rcnn/symnet/symbol_vgg.py b/example/rcnn/symnet/symbol_vgg.py new file mode 100644 index 000000000000..ff15bc58f1ed --- /dev/null +++ b/example/rcnn/symnet/symbol_vgg.py @@ -0,0 +1,232 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +from . import proposal_target + + +def get_vgg_feature(data): + # group 1 + conv1_1 = mx.symbol.Convolution( + data=data, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_1") + relu1_1 = mx.symbol.Activation(data=conv1_1, act_type="relu", name="relu1_1") + conv1_2 = mx.symbol.Convolution( + data=relu1_1, kernel=(3, 3), pad=(1, 1), num_filter=64, workspace=2048, name="conv1_2") + relu1_2 = mx.symbol.Activation(data=conv1_2, act_type="relu", name="relu1_2") + pool1 = mx.symbol.Pooling( + data=relu1_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool1") + # group 2 + conv2_1 = mx.symbol.Convolution( + data=pool1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_1") + relu2_1 = mx.symbol.Activation(data=conv2_1, act_type="relu", name="relu2_1") + conv2_2 = mx.symbol.Convolution( + data=relu2_1, kernel=(3, 3), pad=(1, 1), num_filter=128, workspace=2048, name="conv2_2") + relu2_2 = mx.symbol.Activation(data=conv2_2, act_type="relu", name="relu2_2") + pool2 = mx.symbol.Pooling( + data=relu2_2, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool2") + # group 3 + conv3_1 = mx.symbol.Convolution( + data=pool2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_1") + relu3_1 = mx.symbol.Activation(data=conv3_1, act_type="relu", name="relu3_1") + conv3_2 = mx.symbol.Convolution( + data=relu3_1, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_2") + relu3_2 = mx.symbol.Activation(data=conv3_2, act_type="relu", name="relu3_2") + conv3_3 = mx.symbol.Convolution( + data=relu3_2, kernel=(3, 3), pad=(1, 1), num_filter=256, workspace=2048, name="conv3_3") + relu3_3 = mx.symbol.Activation(data=conv3_3, act_type="relu", name="relu3_3") + pool3 = mx.symbol.Pooling( + data=relu3_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool3") + # group 4 + conv4_1 = 
mx.symbol.Convolution( + data=pool3, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_1") + relu4_1 = mx.symbol.Activation(data=conv4_1, act_type="relu", name="relu4_1") + conv4_2 = mx.symbol.Convolution( + data=relu4_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_2") + relu4_2 = mx.symbol.Activation(data=conv4_2, act_type="relu", name="relu4_2") + conv4_3 = mx.symbol.Convolution( + data=relu4_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv4_3") + relu4_3 = mx.symbol.Activation(data=conv4_3, act_type="relu", name="relu4_3") + pool4 = mx.symbol.Pooling( + data=relu4_3, pool_type="max", kernel=(2, 2), stride=(2, 2), name="pool4") + # group 5 + conv5_1 = mx.symbol.Convolution( + data=pool4, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_1") + relu5_1 = mx.symbol.Activation(data=conv5_1, act_type="relu", name="relu5_1") + conv5_2 = mx.symbol.Convolution( + data=relu5_1, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_2") + relu5_2 = mx.symbol.Activation(data=conv5_2, act_type="relu", name="relu5_2") + conv5_3 = mx.symbol.Convolution( + data=relu5_2, kernel=(3, 3), pad=(1, 1), num_filter=512, workspace=2048, name="conv5_3") + relu5_3 = mx.symbol.Activation(data=conv5_3, act_type="relu", name="relu5_3") + + return relu5_3 + + +def get_vgg_top_feature(data): + # group 6 + flatten = mx.symbol.Flatten(data=data, name="flatten") + fc6 = mx.symbol.FullyConnected(data=flatten, num_hidden=4096, name="fc6") + relu6 = mx.symbol.Activation(data=fc6, act_type="relu", name="relu6") + drop6 = mx.symbol.Dropout(data=relu6, p=0.5, name="drop6") + # group 7 + fc7 = mx.symbol.FullyConnected(data=drop6, num_hidden=4096, name="fc7") + relu7 = mx.symbol.Activation(data=fc7, act_type="relu", name="relu7") + drop7 = mx.symbol.Dropout(data=relu7, p=0.5, name="drop7") + return drop7 + + + +def get_vgg_train(anchor_scales, anchor_ratios, rpn_feature_stride, + rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size, rpn_batch_rois, + num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size, + rcnn_batch_rois, rcnn_fg_fraction, rcnn_fg_overlap, rcnn_bbox_stds): + num_anchors = len(anchor_scales) * len(anchor_ratios) + + data = mx.symbol.Variable(name="data") + im_info = mx.symbol.Variable(name="im_info") + gt_boxes = mx.symbol.Variable(name="gt_boxes") + rpn_label = mx.symbol.Variable(name='label') + rpn_bbox_target = mx.symbol.Variable(name='bbox_target') + rpn_bbox_weight = mx.symbol.Variable(name='bbox_weight') + + # shared convolutional layers + conv_feat = get_vgg_feature(data) + + # RPN layers + rpn_conv = mx.symbol.Convolution( + data=conv_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + + # rpn classification + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, label=rpn_label, multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, name="rpn_cls_prob") + rpn_cls_act = mx.symbol.softmax( + data=rpn_cls_score_reshape, axis=1, name="rpn_cls_act") + rpn_cls_act_reshape = mx.symbol.Reshape( + data=rpn_cls_act, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_act_reshape') + + # rpn bbox regression 
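+    # note: smooth_l1 with scalar=3.0 corresponds to sigma=3 in the Faster R-CNN
+    # paper, i.e. the loss switches from squared to absolute error at |x| = 1/sigma^2;
+    # grad_scale=1.0/rpn_batch_rois below normalizes the loss over the RPN minibatch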
+ rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + rpn_bbox_loss_ = rpn_bbox_weight * mx.symbol.smooth_l1(name='rpn_bbox_loss_', scalar=3.0, data=(rpn_bbox_pred - rpn_bbox_target)) + rpn_bbox_loss = mx.sym.MakeLoss(name='rpn_bbox_loss', data=rpn_bbox_loss_, grad_scale=1.0 / rpn_batch_rois) + + # rpn proposal + rois = mx.symbol.contrib.MultiProposal( + cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios, + rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk, + threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size) + + # rcnn roi proposal target + group = mx.symbol.Custom(rois=rois, gt_boxes=gt_boxes, op_type='proposal_target', + num_classes=num_classes, batch_images=rcnn_batch_size, + batch_rois=rcnn_batch_rois, fg_fraction=rcnn_fg_fraction, + fg_overlap=rcnn_fg_overlap, box_stds=rcnn_bbox_stds) + rois = group[0] + label = group[1] + bbox_target = group[2] + bbox_weight = group[3] + + # rcnn roi pool + roi_pool = mx.symbol.ROIPooling( + name='roi_pool', data=conv_feat, rois=rois, pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride) + + # rcnn top feature + top_feat = get_vgg_top_feature(roi_pool) + + # rcnn classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=top_feat, num_hidden=num_classes) + cls_prob = mx.symbol.SoftmaxOutput(name='cls_prob', data=cls_score, label=label, normalization='batch') + + # rcnn bbox regression + bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=top_feat, num_hidden=num_classes * 4) + bbox_loss_ = bbox_weight * mx.symbol.smooth_l1(name='bbox_loss_', scalar=1.0, data=(bbox_pred - bbox_target)) + bbox_loss = mx.sym.MakeLoss(name='bbox_loss', data=bbox_loss_, grad_scale=1.0 / rcnn_batch_rois) + + # reshape output + label = mx.symbol.Reshape(data=label, shape=(rcnn_batch_size, -1), name='label_reshape') + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape') + bbox_loss = mx.symbol.Reshape(data=bbox_loss, shape=(rcnn_batch_size, -1, 4 * num_classes), name='bbox_loss_reshape') + + # group output + group = mx.symbol.Group([rpn_cls_prob, rpn_bbox_loss, cls_prob, bbox_loss, mx.symbol.BlockGrad(label)]) + return group + + +def get_vgg_test(anchor_scales, anchor_ratios, rpn_feature_stride, + rpn_pre_topk, rpn_post_topk, rpn_nms_thresh, rpn_min_size, + num_classes, rcnn_feature_stride, rcnn_pooled_size, rcnn_batch_size): + num_anchors = len(anchor_scales) * len(anchor_ratios) + + data = mx.symbol.Variable(name="data") + im_info = mx.symbol.Variable(name="im_info") + + # shared convolutional layers + conv_feat = get_vgg_feature(data) + + # rpn feature + rpn_conv = mx.symbol.Convolution( + data=conv_feat, kernel=(3, 3), pad=(1, 1), num_filter=512, name="rpn_conv_3x3") + rpn_relu = mx.symbol.Activation(data=rpn_conv, act_type="relu", name="rpn_relu") + + # rpn classification + rpn_cls_score = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=2 * num_anchors, name="rpn_cls_score") + rpn_cls_score_reshape = mx.symbol.Reshape( + data=rpn_cls_score, shape=(0, 2, -1, 0), name="rpn_cls_score_reshape") + rpn_cls_act = mx.symbol.softmax( + data=rpn_cls_score_reshape, axis=1, name="rpn_cls_act") + rpn_cls_act_reshape = mx.symbol.Reshape( + data=rpn_cls_act, shape=(0, 2 * num_anchors, -1, 0), name='rpn_cls_act_reshape') + + # rpn bbox regression 
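+    # note: at test time only the raw rpn_bbox_pred deltas are needed, so the
+    # smooth L1 loss and its bbox_target/bbox_weight inputs from training are omitted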
+ rpn_bbox_pred = mx.symbol.Convolution( + data=rpn_relu, kernel=(1, 1), pad=(0, 0), num_filter=4 * num_anchors, name="rpn_bbox_pred") + + # rpn proposal + rois = mx.symbol.contrib.MultiProposal( + cls_prob=rpn_cls_act_reshape, bbox_pred=rpn_bbox_pred, im_info=im_info, name='rois', + feature_stride=rpn_feature_stride, scales=anchor_scales, ratios=anchor_ratios, + rpn_pre_nms_top_n=rpn_pre_topk, rpn_post_nms_top_n=rpn_post_topk, + threshold=rpn_nms_thresh, rpn_min_size=rpn_min_size) + + # rcnn roi pool + roi_pool = mx.symbol.ROIPooling( + name='roi_pool', data=conv_feat, rois=rois, pooled_size=rcnn_pooled_size, spatial_scale=1.0 / rcnn_feature_stride) + + # rcnn top feature + top_feat = get_vgg_top_feature(roi_pool) + + # rcnn classification + cls_score = mx.symbol.FullyConnected(name='cls_score', data=top_feat, num_hidden=num_classes) + cls_prob = mx.symbol.softmax(name='cls_prob', data=cls_score) + + # rcnn bbox regression + bbox_pred = mx.symbol.FullyConnected(name='bbox_pred', data=top_feat, num_hidden=num_classes * 4) + + # reshape output + cls_prob = mx.symbol.Reshape(data=cls_prob, shape=(rcnn_batch_size, -1, num_classes), name='cls_prob_reshape') + bbox_pred = mx.symbol.Reshape(data=bbox_pred, shape=(rcnn_batch_size, -1, 4 * num_classes), name='bbox_pred_reshape') + + # group output + group = mx.symbol.Group([rois, cls_prob, bbox_pred]) + return group diff --git a/example/rcnn/test.py b/example/rcnn/test.py index 2989bc02a4f7..3c047d222016 100644 --- a/example/rcnn/test.py +++ b/example/rcnn/test.py @@ -16,43 +16,211 @@ # under the License. import argparse +import ast +import pprint + import mxnet as mx -from rcnn.logger import logger -from rcnn.config import config, default, generate_config -from rcnn.tools.test_rcnn import test_rcnn +from mxnet.module import Module +import numpy as np +from tqdm import tqdm + +from symdata.bbox import im_detect +from symdata.loader import TestLoader +from symnet.logger import logger +from symnet.model import load_param, check_shape + + +def test_net(sym, imdb, args): + # print config + logger.info('called with args\n{}'.format(pprint.pformat(vars(args)))) + + # setup context + ctx = mx.gpu(args.gpu) + + # load testing data + test_data = TestLoader(imdb.roidb, batch_size=1, short=args.img_short_side, max_size=args.img_long_side, + mean=args.img_pixel_means, std=args.img_pixel_stds) + + # load params + arg_params, aux_params = load_param(args.params, ctx=ctx) + + # produce shape max possible + data_names = ['data', 'im_info'] + label_names = None + data_shapes = [('data', (1, 3, args.img_long_side, args.img_long_side)), ('im_info', (1, 3))] + label_shapes = None + + # check shapes + check_shape(sym, data_shapes, arg_params, aux_params) + + # create and bind module + mod = Module(sym, data_names, label_names, context=ctx) + mod.bind(data_shapes, label_shapes, for_training=False) + mod.init_params(arg_params=arg_params, aux_params=aux_params) + + # all detections are collected into: + # all_boxes[cls][image] = N x 5 array of detections in + # (x1, y1, x2, y2, score) + all_boxes = [[[] for _ in range(imdb.num_images)] + for _ in range(imdb.num_classes)] + + # start detection + with tqdm(total=imdb.num_images) as pbar: + for i, data_batch in enumerate(test_data): + # forward + im_info = data_batch.data[1][0] + mod.forward(data_batch) + rois, scores, bbox_deltas = mod.get_outputs() + rois = rois[:, 1:] + scores = scores[0] + bbox_deltas = bbox_deltas[0] + + det = im_detect(rois, scores, bbox_deltas, im_info, + bbox_stds=args.rcnn_bbox_stds, 
nms_thresh=args.rcnn_nms_thresh, + conf_thresh=args.rcnn_conf_thresh) + for j in range(1, imdb.num_classes): + indexes = np.where(det[:, 0] == j)[0] + all_boxes[j][i] = np.concatenate((det[:, -4:], det[:, [1]]), axis=-1)[indexes, :] + pbar.update(data_batch.data[0].shape[0]) + + # evaluate model + imdb.evaluate_detections(all_boxes) def parse_args(): - parser = argparse.ArgumentParser(description='Test a Faster R-CNN network') - # general - parser.add_argument('--network', help='network name', default=default.network, type=str) - parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) - args, rest = parser.parse_known_args() - generate_config(args.network, args.dataset) - parser.add_argument('--image_set', help='image_set name', default=default.test_image_set, type=str) - parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) - parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) - # testing - parser.add_argument('--prefix', help='model to test with', default=default.e2e_prefix, type=str) - parser.add_argument('--epoch', help='model to test with', default=default.e2e_epoch, type=int) - parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) - # rcnn - parser.add_argument('--vis', help='turn on visualization', action='store_true') - parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) - parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') - parser.add_argument('--has_rpn', help='generate proposals on the fly', action='store_true', default=True) - parser.add_argument('--proposal', help='can be ss for selective search or rpn', default='rpn', type=str) + parser = argparse.ArgumentParser(description='Test a Faster R-CNN network', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--network', type=str, default='vgg16', help='base network') + parser.add_argument('--params', type=str, default='', help='path to trained model') + parser.add_argument('--dataset', type=str, default='voc', help='training dataset') + parser.add_argument('--imageset', type=str, default='', help='imageset splits') + parser.add_argument('--gpu', type=int, default=0, help='gpu device eg. 
0') + # faster rcnn params + parser.add_argument('--img-short-side', type=int, default=600) + parser.add_argument('--img-long-side', type=int, default=1000) + parser.add_argument('--img-pixel-means', type=str, default='(0.0, 0.0, 0.0)') + parser.add_argument('--img-pixel-stds', type=str, default='(1.0, 1.0, 1.0)') + parser.add_argument('--rpn-feat-stride', type=int, default=16) + parser.add_argument('--rpn-anchor-scales', type=str, default='(8, 16, 32)') + parser.add_argument('--rpn-anchor-ratios', type=str, default='(0.5, 1, 2)') + parser.add_argument('--rpn-pre-nms-topk', type=int, default=6000) + parser.add_argument('--rpn-post-nms-topk', type=int, default=300) + parser.add_argument('--rpn-nms-thresh', type=float, default=0.7) + parser.add_argument('--rpn-min-size', type=int, default=16) + parser.add_argument('--rcnn-num-classes', type=int, default=21) + parser.add_argument('--rcnn-feat-stride', type=int, default=16) + parser.add_argument('--rcnn-pooled-size', type=str, default='(14, 14)') + parser.add_argument('--rcnn-batch-size', type=int, default=1) + parser.add_argument('--rcnn-bbox-stds', type=str, default='(0.1, 0.1, 0.2, 0.2)') + parser.add_argument('--rcnn-nms-thresh', type=float, default=0.3) + parser.add_argument('--rcnn-conf-thresh', type=float, default=1e-3) args = parser.parse_args() + args.img_pixel_means = ast.literal_eval(args.img_pixel_means) + args.img_pixel_stds = ast.literal_eval(args.img_pixel_stds) + args.rpn_anchor_scales = ast.literal_eval(args.rpn_anchor_scales) + args.rpn_anchor_ratios = ast.literal_eval(args.rpn_anchor_ratios) + args.rcnn_pooled_size = ast.literal_eval(args.rcnn_pooled_size) + args.rcnn_bbox_stds = ast.literal_eval(args.rcnn_bbox_stds) return args +def get_voc(args): + from symimdb.pascal_voc import PascalVOC + if not args.imageset: + args.imageset = '2007_test' + args.rcnn_num_classes = len(PascalVOC.classes) + return PascalVOC(args.imageset, 'data', 'data/VOCdevkit') + + +def get_coco(args): + from symimdb.coco import coco + if not args.imageset: + args.imageset = 'val2017' + args.rcnn_num_classes = len(coco.classes) + return coco(args.imageset, 'data', 'data/coco') + + +def get_vgg16_test(args): + from symnet.symbol_vgg import get_vgg_test + if not args.params: + args.params = 'model/vgg16-0010.params' + args.img_pixel_means = (123.68, 116.779, 103.939) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.net_fixed_params = ['conv1', 'conv2'] + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (7, 7) + return get_vgg_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size) + + +def get_resnet50_test(args): + from symnet.symbol_resnet import get_resnet_test + if not args.params: + args.params = 'model/resnet50-0010.params' + args.img_pixel_means = (0.0, 0.0, 0.0) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (14, 14) + return get_resnet_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + 
rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size, + units=(3, 4, 6, 3), filter_list=(256, 512, 1024, 2048)) + + +def get_resnet101_test(args): + from symnet.symbol_resnet import get_resnet_test + if not args.params: + args.params = 'model/resnet101-0010.params' + args.img_pixel_means = (0.0, 0.0, 0.0) + args.img_pixel_stds = (1.0, 1.0, 1.0) + args.rpn_feat_stride = 16 + args.rcnn_feat_stride = 16 + args.rcnn_pooled_size = (14, 14) + return get_resnet_test(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios, + rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk, + rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh, + rpn_min_size=args.rpn_min_size, + num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride, + rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size, + units=(3, 4, 23, 3), filter_list=(256, 512, 1024, 2048)) + + +def get_dataset(dataset, args): + datasets = { + 'voc': get_voc, + 'coco': get_coco + } + if dataset not in datasets: + raise ValueError("dataset {} not supported".format(dataset)) + return datasets[dataset](args) + + +def get_network(network, args): + networks = { + 'vgg16': get_vgg16_test, + 'resnet50': get_resnet50_test, + 'resnet101': get_resnet101_test + } + if network not in networks: + raise ValueError("network {} not supported".format(network)) + return networks[network](args) + + def main(): args = parse_args() - logger.info('Called with argument: %s' % args) - ctx = mx.gpu(args.gpu) - test_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, - ctx, args.prefix, args.epoch, - args.vis, args.shuffle, args.has_rpn, args.proposal, args.thresh) + imdb = get_dataset(args.dataset, args) + sym = get_network(args.network, args) + test_net(sym, imdb, args) + if __name__ == '__main__': main() diff --git a/example/rcnn/train.py b/example/rcnn/train.py new file mode 100644 index 000000000000..0739069afb4a --- /dev/null +++ b/example/rcnn/train.py @@ -0,0 +1,303 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
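+
+# Example invocation (assuming VOC data under data/VOCdevkit and a pretrained
+# model under model/, matching the defaults resolved below):
+#   python train.py --network vgg16 --dataset voc --gpus 0,1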
+ +import argparse +import ast +import pprint + +import mxnet as mx +from mxnet.module import Module + +from symdata.loader import AnchorGenerator, AnchorSampler, AnchorLoader +from symnet.logger import logger +from symnet.model import load_param, infer_data_shape, check_shape, initialize_frcnn, get_fixed_params +from symnet.metric import RPNAccMetric, RPNLogLossMetric, RPNL1LossMetric, RCNNAccMetric, RCNNLogLossMetric, RCNNL1LossMetric + + +def train_net(sym, roidb, args): + # print config + logger.info('called with args\n{}'.format(pprint.pformat(vars(args)))) + + # setup multi-gpu + ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + batch_size = args.rcnn_batch_size * len(ctx) + + # load training data + feat_sym = sym.get_internals()['rpn_cls_score_output'] + ag = AnchorGenerator(feat_stride=args.rpn_feat_stride, + anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios) + asp = AnchorSampler(allowed_border=args.rpn_allowed_border, batch_rois=args.rpn_batch_rois, + fg_fraction=args.rpn_fg_fraction, fg_overlap=args.rpn_fg_overlap, + bg_overlap=args.rpn_bg_overlap) + train_data = AnchorLoader(roidb, batch_size, args.img_short_side, args.img_long_side, + args.img_pixel_means, args.img_pixel_stds, feat_sym, ag, asp, shuffle=True) + + # produce shape max possible + _, out_shape, _ = feat_sym.infer_shape(data=(1, 3, args.img_long_side, args.img_long_side)) + feat_height, feat_width = out_shape[0][-2:] + rpn_num_anchors = len(args.rpn_anchor_scales) * len(args.rpn_anchor_ratios) + data_names = ['data', 'im_info', 'gt_boxes'] + label_names = ['label', 'bbox_target', 'bbox_weight'] + data_shapes = [('data', (batch_size, 3, args.img_long_side, args.img_long_side)), + ('im_info', (batch_size, 3)), + ('gt_boxes', (batch_size, 100, 5))] + label_shapes = [('label', (batch_size, 1, rpn_num_anchors * feat_height, feat_width)), + ('bbox_target', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width)), + ('bbox_weight', (batch_size, 4 * rpn_num_anchors, feat_height, feat_width))] + + # print shapes + data_shape_dict, out_shape_dict = infer_data_shape(sym, data_shapes + label_shapes) + logger.info('max input shape\n%s' % pprint.pformat(data_shape_dict)) + logger.info('max output shape\n%s' % pprint.pformat(out_shape_dict)) + + # load and initialize params + if args.resume: + arg_params, aux_params = load_param(args.resume) + else: + arg_params, aux_params = load_param(args.pretrained) + arg_params, aux_params = initialize_frcnn(sym, data_shapes, arg_params, aux_params) + + # check parameter shapes + check_shape(sym, data_shapes + label_shapes, arg_params, aux_params) + + # check fixed params + fixed_param_names = get_fixed_params(sym, args.net_fixed_params) + logger.info('locking params\n%s' % pprint.pformat(fixed_param_names)) + + # metric + rpn_eval_metric = RPNAccMetric() + rpn_cls_metric = RPNLogLossMetric() + rpn_bbox_metric = RPNL1LossMetric() + eval_metric = RCNNAccMetric() + cls_metric = RCNNLogLossMetric() + bbox_metric = RCNNL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + + # callback + batch_end_callback = mx.callback.Speedometer(batch_size, frequent=args.log_interval, auto_reset=False) + epoch_end_callback = mx.callback.do_checkpoint(args.save_prefix) + + # learning schedule + base_lr = args.lr + lr_factor = 0.1 + lr_epoch = [int(epoch) for epoch in args.lr_decay_epoch.split(',')] + lr_epoch_diff = [epoch - 
args.start_epoch for epoch in lr_epoch if epoch > args.start_epoch] + lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = {'momentum': 0.9, + 'wd': 0.0005, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 'clip_gradient': 5} + + # train + mod = Module(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=None, + fixed_param_names=fixed_param_names) + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore='device', + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=arg_params, aux_params=aux_params, begin_epoch=args.start_epoch, num_epoch=args.epochs) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train Faster R-CNN network', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('--network', type=str, default='vgg16', help='base network') + parser.add_argument('--pretrained', type=str, default='', help='path to pretrained model') + parser.add_argument('--dataset', type=str, default='voc', help='training dataset') + parser.add_argument('--imageset', type=str, default='', help='imageset splits') + parser.add_argument('--gpus', type=str, default='0', help='gpu devices eg. 0,1') + parser.add_argument('--epochs', type=int, default=10, help='training epochs') + parser.add_argument('--lr', type=float, default=0.001, help='base learning rate') + parser.add_argument('--lr-decay-epoch', type=str, default='7', help='epoch to decay lr') + parser.add_argument('--resume', type=str, default='', help='path to last saved model') + parser.add_argument('--start-epoch', type=int, default=0, help='start epoch for resuming') + parser.add_argument('--log-interval', type=int, default=100, help='logging mini batch interval') + parser.add_argument('--save-prefix', type=str, default='', help='saving params prefix') + # faster rcnn params + parser.add_argument('--img-short-side', type=int, default=600) + parser.add_argument('--img-long-side', type=int, default=1000) + parser.add_argument('--img-pixel-means', type=str, default='(0.0, 0.0, 0.0)') + parser.add_argument('--img-pixel-stds', type=str, default='(1.0, 1.0, 1.0)') + parser.add_argument('--net-fixed-params', type=str, default='["conv0", "stage1", "gamma", "beta"]') + parser.add_argument('--rpn-feat-stride', type=int, default=16) + parser.add_argument('--rpn-anchor-scales', type=str, default='(8, 16, 32)') + parser.add_argument('--rpn-anchor-ratios', type=str, default='(0.5, 1, 2)') + parser.add_argument('--rpn-pre-nms-topk', type=int, default=12000) + parser.add_argument('--rpn-post-nms-topk', type=int, default=2000) + parser.add_argument('--rpn-nms-thresh', type=float, default=0.7) + parser.add_argument('--rpn-min-size', type=int, default=16) + parser.add_argument('--rpn-batch-rois', type=int, default=256) + parser.add_argument('--rpn-allowed-border', type=int, default=0) + parser.add_argument('--rpn-fg-fraction', type=float, default=0.5) + parser.add_argument('--rpn-fg-overlap', type=float, default=0.7) + parser.add_argument('--rpn-bg-overlap', type=float, default=0.3) + parser.add_argument('--rcnn-num-classes', type=int, default=21) + 
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Train Faster R-CNN network',
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument('--network', type=str, default='vgg16', help='base network')
+    parser.add_argument('--pretrained', type=str, default='', help='path to pretrained model')
+    parser.add_argument('--dataset', type=str, default='voc', help='training dataset')
+    parser.add_argument('--imageset', type=str, default='', help='imageset splits')
+    parser.add_argument('--gpus', type=str, default='0', help='gpu devices, e.g. 0,1')
+    parser.add_argument('--epochs', type=int, default=10, help='training epochs')
+    parser.add_argument('--lr', type=float, default=0.001, help='base learning rate')
+    parser.add_argument('--lr-decay-epoch', type=str, default='7', help='epoch to decay lr')
+    parser.add_argument('--resume', type=str, default='', help='path to last saved model')
+    parser.add_argument('--start-epoch', type=int, default=0, help='start epoch for resuming')
+    parser.add_argument('--log-interval', type=int, default=100, help='logging mini batch interval')
+    parser.add_argument('--save-prefix', type=str, default='', help='saving params prefix')
+    # faster rcnn params
+    parser.add_argument('--img-short-side', type=int, default=600)
+    parser.add_argument('--img-long-side', type=int, default=1000)
+    parser.add_argument('--img-pixel-means', type=str, default='(0.0, 0.0, 0.0)')
+    parser.add_argument('--img-pixel-stds', type=str, default='(1.0, 1.0, 1.0)')
+    parser.add_argument('--net-fixed-params', type=str, default='["conv0", "stage1", "gamma", "beta"]')
+    parser.add_argument('--rpn-feat-stride', type=int, default=16)
+    parser.add_argument('--rpn-anchor-scales', type=str, default='(8, 16, 32)')
+    parser.add_argument('--rpn-anchor-ratios', type=str, default='(0.5, 1, 2)')
+    parser.add_argument('--rpn-pre-nms-topk', type=int, default=12000)
+    parser.add_argument('--rpn-post-nms-topk', type=int, default=2000)
+    parser.add_argument('--rpn-nms-thresh', type=float, default=0.7)
+    parser.add_argument('--rpn-min-size', type=int, default=16)
+    parser.add_argument('--rpn-batch-rois', type=int, default=256)
+    parser.add_argument('--rpn-allowed-border', type=int, default=0)
+    parser.add_argument('--rpn-fg-fraction', type=float, default=0.5)
+    parser.add_argument('--rpn-fg-overlap', type=float, default=0.7)
+    parser.add_argument('--rpn-bg-overlap', type=float, default=0.3)
+    parser.add_argument('--rcnn-num-classes', type=int, default=21)
+    parser.add_argument('--rcnn-feat-stride', type=int, default=16)
+    parser.add_argument('--rcnn-pooled-size', type=str, default='(14, 14)')
+    parser.add_argument('--rcnn-batch-size', type=int, default=1)
+    parser.add_argument('--rcnn-batch-rois', type=int, default=128)
+    parser.add_argument('--rcnn-fg-fraction', type=float, default=0.25)
+    parser.add_argument('--rcnn-fg-overlap', type=float, default=0.5)
+    parser.add_argument('--rcnn-bbox-stds', type=str, default='(0.1, 0.1, 0.2, 0.2)')
+    args = parser.parse_args()
+    args.img_pixel_means = ast.literal_eval(args.img_pixel_means)
+    args.img_pixel_stds = ast.literal_eval(args.img_pixel_stds)
+    args.net_fixed_params = ast.literal_eval(args.net_fixed_params)
+    args.rpn_anchor_scales = ast.literal_eval(args.rpn_anchor_scales)
+    args.rpn_anchor_ratios = ast.literal_eval(args.rpn_anchor_ratios)
+    args.rcnn_pooled_size = ast.literal_eval(args.rcnn_pooled_size)
+    args.rcnn_bbox_stds = ast.literal_eval(args.rcnn_bbox_stds)
+    return args
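+
+# Tuple- and list-valued options are passed as Python literals and decoded with
+# ast.literal_eval above; a hypothetical invocation (script name and values
+# assumed for illustration):
+#   python train.py --network resnet50 --gpus 0,1 \
+#       --rpn-anchor-scales "(4, 8, 16, 32)" --net-fixed-params '["conv0", "stage1"]'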
+
+
+def get_voc(args):
+    from symimdb.pascal_voc import PascalVOC
+    if not args.imageset:
+        args.imageset = '2007_trainval'
+    args.rcnn_num_classes = len(PascalVOC.classes)
+
+    isets = args.imageset.split('+')
+    roidb = []
+    for iset in isets:
+        imdb = PascalVOC(iset, 'data', 'data/VOCdevkit')
+        imdb.append_flipped_images()
+        roidb.extend(imdb.roidb)
+    return roidb
+
+
+def get_coco(args):
+    from symimdb.coco import coco
+    if not args.imageset:
+        args.imageset = 'train2017'
+    args.rcnn_num_classes = len(coco.classes)
+
+    isets = args.imageset.split('+')
+    roidb = []
+    for iset in isets:
+        imdb = coco(iset, 'data', 'data/coco')
+        imdb.filter_roidb()
+        imdb.append_flipped_images()
+        roidb.extend(imdb.roidb)
+    return roidb
+
+
+def get_vgg16_train(args):
+    from symnet.symbol_vgg import get_vgg_train
+    if not args.pretrained:
+        args.pretrained = 'model/vgg16-0000.params'
+    if not args.save_prefix:
+        args.save_prefix = 'model/vgg16'
+    args.img_pixel_means = (123.68, 116.779, 103.939)
+    args.img_pixel_stds = (1.0, 1.0, 1.0)
+    args.net_fixed_params = ['conv1', 'conv2']
+    args.rpn_feat_stride = 16
+    args.rcnn_feat_stride = 16
+    args.rcnn_pooled_size = (7, 7)
+    return get_vgg_train(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios,
+                         rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk,
+                         rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh,
+                         rpn_min_size=args.rpn_min_size, rpn_batch_rois=args.rpn_batch_rois,
+                         num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride,
+                         rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size,
+                         rcnn_batch_rois=args.rcnn_batch_rois, rcnn_fg_fraction=args.rcnn_fg_fraction,
+                         rcnn_fg_overlap=args.rcnn_fg_overlap, rcnn_bbox_stds=args.rcnn_bbox_stds)
+
+
+def get_resnet50_train(args):
+    from symnet.symbol_resnet import get_resnet_train
+    if not args.pretrained:
+        args.pretrained = 'model/resnet-50-0000.params'
+    if not args.save_prefix:
+        args.save_prefix = 'model/resnet50'
+    args.img_pixel_means = (0.0, 0.0, 0.0)
+    args.img_pixel_stds = (1.0, 1.0, 1.0)
+    args.net_fixed_params = ['conv0', 'stage1', 'gamma', 'beta']
+    args.rpn_feat_stride = 16
+    args.rcnn_feat_stride = 16
+    args.rcnn_pooled_size = (14, 14)
+    return get_resnet_train(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios,
+                            rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk,
+                            rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh,
+                            rpn_min_size=args.rpn_min_size, rpn_batch_rois=args.rpn_batch_rois,
+                            num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride,
+                            rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size,
+                            rcnn_batch_rois=args.rcnn_batch_rois, rcnn_fg_fraction=args.rcnn_fg_fraction,
+                            rcnn_fg_overlap=args.rcnn_fg_overlap, rcnn_bbox_stds=args.rcnn_bbox_stds,
+                            units=(3, 4, 6, 3), filter_list=(256, 512, 1024, 2048))
+
+
+def get_resnet101_train(args):
+    from symnet.symbol_resnet import get_resnet_train
+    if not args.pretrained:
+        args.pretrained = 'model/resnet-101-0000.params'
+    if not args.save_prefix:
+        args.save_prefix = 'model/resnet101'
+    args.img_pixel_means = (0.0, 0.0, 0.0)
+    args.img_pixel_stds = (1.0, 1.0, 1.0)
+    args.net_fixed_params = ['conv0', 'stage1', 'gamma', 'beta']
+    args.rpn_feat_stride = 16
+    args.rcnn_feat_stride = 16
+    args.rcnn_pooled_size = (14, 14)
+    return get_resnet_train(anchor_scales=args.rpn_anchor_scales, anchor_ratios=args.rpn_anchor_ratios,
+                            rpn_feature_stride=args.rpn_feat_stride, rpn_pre_topk=args.rpn_pre_nms_topk,
+                            rpn_post_topk=args.rpn_post_nms_topk, rpn_nms_thresh=args.rpn_nms_thresh,
+                            rpn_min_size=args.rpn_min_size, rpn_batch_rois=args.rpn_batch_rois,
+                            num_classes=args.rcnn_num_classes, rcnn_feature_stride=args.rcnn_feat_stride,
+                            rcnn_pooled_size=args.rcnn_pooled_size, rcnn_batch_size=args.rcnn_batch_size,
+                            rcnn_batch_rois=args.rcnn_batch_rois, rcnn_fg_fraction=args.rcnn_fg_fraction,
+                            rcnn_fg_overlap=args.rcnn_fg_overlap, rcnn_bbox_stds=args.rcnn_bbox_stds,
+                            units=(3, 4, 23, 3), filter_list=(256, 512, 1024, 2048))
+
+
+def get_dataset(dataset, args):
+    datasets = {
+        'voc': get_voc,
+        'coco': get_coco
+    }
+    if dataset not in datasets:
+        raise ValueError("dataset {} not supported".format(dataset))
+    return datasets[dataset](args)
+
+
+def get_network(network, args):
+    networks = {
+        'vgg16': get_vgg16_train,
+        'resnet50': get_resnet50_train,
+        'resnet101': get_resnet101_train
+    }
+    if network not in networks:
+        raise ValueError("network {} not supported".format(network))
+    return networks[network](args)
+
+
+def main():
+    args = parse_args()
+    roidb = get_dataset(args.dataset, args)
+    sym = get_network(args.network, args)
+    train_net(sym, roidb, args)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/example/rcnn/train_alternate.py b/example/rcnn/train_alternate.py
deleted file mode 100644
index 715816087a61..000000000000
--- a/example/rcnn/train_alternate.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import mxnet as mx
-
-from rcnn.logger import logger
-from rcnn.config import config, default, generate_config
-from rcnn.tools.train_rpn import train_rpn
-from rcnn.tools.test_rpn import test_rpn
-from rcnn.tools.train_rcnn import train_rcnn
-from rcnn.utils.combine_model import combine_model
-
-
-def alternate_train(args, ctx, pretrained, epoch,
-                    rpn_epoch, rpn_lr, rpn_lr_step,
-                    rcnn_epoch, rcnn_lr, rcnn_lr_step):
-    # basic config
-    begin_epoch = 0
-    config.TRAIN.BG_THRESH_LO = 0.0
-
-    logger.info('########## TRAIN RPN WITH IMAGENET INIT')
-    train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path,
-              args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume,
-              ctx, pretrained, epoch, 'model/rpn1', begin_epoch, rpn_epoch,
-              train_shared=False, lr=rpn_lr, lr_step=rpn_lr_step)
-
-    logger.info('########## GENERATE RPN DETECTION')
-    image_sets = [iset for iset in args.image_set.split('+')]
-    for image_set in image_sets:
-        test_rpn(args.network, args.dataset, image_set, args.root_path, args.dataset_path,
-                 ctx[0], 'model/rpn1', rpn_epoch,
-                 vis=False, shuffle=False, thresh=0)
-
-    logger.info('########## TRAIN RCNN WITH IMAGENET INIT AND RPN DETECTION')
-    train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path,
-               args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume,
-               ctx, pretrained, epoch, 'model/rcnn1', begin_epoch, rcnn_epoch,
-               train_shared=False, lr=rcnn_lr, lr_step=rcnn_lr_step, proposal='rpn')
-
-    logger.info('########## TRAIN RPN WITH RCNN INIT')
-    train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path,
-              args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume,
-              ctx, 'model/rcnn1', rcnn_epoch, 'model/rpn2', begin_epoch, rpn_epoch,
-              train_shared=True, lr=rpn_lr, lr_step=rpn_lr_step)
-
-    logger.info('########## GENERATE RPN DETECTION')
-    image_sets = [iset for iset in args.image_set.split('+')]
-    for image_set in image_sets:
-        test_rpn(args.network, args.dataset, image_set, args.root_path, args.dataset_path,
-                 ctx[0], 'model/rpn2', rpn_epoch,
-                 vis=False, shuffle=False, thresh=0)
-
-    logger.info('########## COMBINE RPN2 WITH RCNN1')
-    combine_model('model/rpn2', rpn_epoch, 'model/rcnn1', rcnn_epoch, 'model/rcnn2', 0)
-
-    logger.info('########## TRAIN RCNN WITH RPN INIT AND DETECTION')
-    train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path,
-               args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume,
-               ctx, 'model/rcnn2', 0, 'model/rcnn2', begin_epoch, rcnn_epoch,
-               train_shared=True, lr=rcnn_lr, lr_step=rcnn_lr_step, proposal='rpn')
-
-    logger.info('########## COMBINE RPN2 WITH RCNN2')
-    combine_model('model/rpn2', rpn_epoch, 'model/rcnn2', rcnn_epoch, 'model/final', 0)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Train Faster R-CNN Network')
-    # general
-    parser.add_argument('--network', help='network name', default=default.network, type=str)
-    parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str)
-    args, rest = parser.parse_known_args()
-    generate_config(args.network, args.dataset)
-    parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str)
-    parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str)
-    parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str)
-    # training
-    parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int)
-    parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str)
-    parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list)
-    parser.add_argument('--no_flip', help='disable flip images', action='store_true')
-    parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true')
-    parser.add_argument('--resume', help='continue training', action='store_true')
-    # alternate
-    parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str)
-    parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str)
-    parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int)
-    parser.add_argument('--rpn_epoch', help='end epoch of rpn training', default=default.rpn_epoch, type=int)
-    parser.add_argument('--rpn_lr', help='base learning rate', default=default.rpn_lr, type=float)
-    parser.add_argument('--rpn_lr_step', help='learning rate steps (in epoch)', default=default.rpn_lr_step, type=str)
-    parser.add_argument('--rcnn_epoch', help='end epoch of rcnn training', default=default.rcnn_epoch, type=int)
-    parser.add_argument('--rcnn_lr', help='base learning rate', default=default.rcnn_lr, type=float)
-    parser.add_argument('--rcnn_lr_step', help='learning rate steps (in epoch)', default=default.rcnn_lr_step, type=str)
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-    logger.info('Called with argument: %s' % args)
-    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    alternate_train(args, ctx, args.pretrained, args.pretrained_epoch,
-                    args.rpn_epoch, args.rpn_lr, args.rpn_lr_step,
-                    args.rcnn_epoch, args.rcnn_lr, args.rcnn_lr_step)
-
-if __name__ == '__main__':
-    main()
diff --git a/example/rcnn/train_end2end.py b/example/rcnn/train_end2end.py
deleted file mode 100644
index 24c658aeca4d..000000000000
--- a/example/rcnn/train_end2end.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import pprint
-import mxnet as mx
-import numpy as np
-
-from rcnn.logger import logger
-from rcnn.config import config, default, generate_config
-from rcnn.symbol import *
-from rcnn.core import callback, metric
-from rcnn.core.loader import AnchorLoader
-from rcnn.core.module import MutableModule
-from rcnn.utils.load_data import load_gt_roidb, merge_roidb, filter_roidb
-from rcnn.utils.load_model import load_param
-
-
-def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch,
-              lr=0.001, lr_step='5'):
-    # setup config
-    config.TRAIN.BATCH_IMAGES = 1
-    config.TRAIN.BATCH_ROIS = 128
-    config.TRAIN.END2END = True
-    config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = True
-
-    # load symbol
-    sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS)
-    feat_sym = sym.get_internals()['rpn_cls_score_output']
-
-    # setup multi-gpu
-    batch_size = len(ctx)
-    input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size
-
-    # print config
-    logger.info(pprint.pformat(config))
-
-    # load dataset and prepare imdb for training
-    image_sets = [iset for iset in args.image_set.split('+')]
-    roidbs = [load_gt_roidb(args.dataset, image_set, args.root_path, args.dataset_path,
-                            flip=not args.no_flip)
-              for image_set in image_sets]
-    roidb = merge_roidb(roidbs)
-    roidb = filter_roidb(roidb)
-
-    # load training data
-    train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle,
-                              ctx=ctx, work_load_list=args.work_load_list,
-                              feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES,
-                              anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING)
-
-    # infer max shape
-    max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))]
-    max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape)
-    max_data_shape.append(('gt_boxes', (input_batch_size, 100, 5)))
-    logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape))
-
-    # infer shape
-    data_shape_dict = dict(train_data.provide_data + train_data.provide_label)
-    arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict)
-    arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape))
-    out_shape_dict = dict(zip(sym.list_outputs(), out_shape))
-    aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape))
-    logger.info('output shape %s' % pprint.pformat(out_shape_dict))
-
-    # load and initialize params
-    if args.resume:
-        arg_params, aux_params = load_param(prefix, begin_epoch, convert=True)
-    else:
-        arg_params, aux_params = load_param(pretrained, epoch, convert=True)
-        arg_params['rpn_conv_3x3_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_conv_3x3_weight'])
-        arg_params['rpn_conv_3x3_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_conv_3x3_bias'])
-        arg_params['rpn_cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_cls_score_weight'])
-        arg_params['rpn_cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_cls_score_bias'])
-        arg_params['rpn_bbox_pred_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['rpn_bbox_pred_weight'])
-        arg_params['rpn_bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['rpn_bbox_pred_bias'])
-        arg_params['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight'])
-        arg_params['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias'])
-        arg_params['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight'])
-        arg_params['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias'])
-
-    # check parameter shapes
-    for k in sym.list_arguments():
-        if k in data_shape_dict:
-            continue
-        assert k in arg_params, k + ' not initialized'
-        assert arg_params[k].shape == arg_shape_dict[k], \
-            'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape)
-    for k in sym.list_auxiliary_states():
-        assert k in aux_params, k + ' not initialized'
-        assert aux_params[k].shape == aux_shape_dict[k], \
-            'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape)
-
-    # create solver
-    fixed_param_prefix = config.FIXED_PARAMS
-    data_names = [k[0] for k in train_data.provide_data]
-    label_names = [k[0] for k in train_data.provide_label]
-    mod = MutableModule(sym, data_names=data_names, label_names=label_names,
-                        logger=logger, context=ctx, work_load_list=args.work_load_list,
-                        max_data_shapes=max_data_shape, max_label_shapes=max_label_shape,
-                        fixed_param_prefix=fixed_param_prefix)
-
-    # decide training params
-    # metric
-    rpn_eval_metric = metric.RPNAccMetric()
-    rpn_cls_metric = metric.RPNLogLossMetric()
-    rpn_bbox_metric = metric.RPNL1LossMetric()
-    eval_metric = metric.RCNNAccMetric()
-    cls_metric = metric.RCNNLogLossMetric()
-    bbox_metric = metric.RCNNL1LossMetric()
-    eval_metrics = mx.metric.CompositeEvalMetric()
-    for child_metric in [rpn_eval_metric, rpn_cls_metric, rpn_bbox_metric, eval_metric, cls_metric, bbox_metric]:
-        eval_metrics.add(child_metric)
-    # callback
-    batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=args.frequent, auto_reset=False)
-    means = np.tile(np.array(config.TRAIN.BBOX_MEANS), config.NUM_CLASSES)
-    stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES)
-    epoch_end_callback = callback.do_checkpoint(prefix, means, stds)
-    # decide learning rate
-    base_lr = lr
-    lr_factor = 0.1
-    lr_epoch = [int(epoch) for epoch in lr_step.split(',')]
-    lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch]
-    lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff)))
-    lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff]
-    logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters))
-    lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor)
-    # optimizer
-    optimizer_params = {'momentum': 0.9,
-                        'wd': 0.0005,
-                        'learning_rate': lr,
-                        'lr_scheduler': lr_scheduler,
-                        'rescale_grad': (1.0 / batch_size),
-                        'clip_gradient': 5}
-
-    # train
-    mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback,
-            batch_end_callback=batch_end_callback, kvstore=args.kvstore,
-            optimizer='sgd', optimizer_params=optimizer_params,
-            arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Train Faster R-CNN network')
-    # general
-    parser.add_argument('--network', help='network name', default=default.network, type=str)
-    parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str)
-    args, rest = parser.parse_known_args()
-    generate_config(args.network, args.dataset)
-    parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str)
-    parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str)
-    parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str)
-    # training
-    parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int)
-    parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str)
-    parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list)
-    parser.add_argument('--no_flip', help='disable flip images', action='store_true')
-    parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true')
-    parser.add_argument('--resume', help='continue training', action='store_true')
-    # e2e
-    parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str)
-    parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str)
-    parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int)
-    parser.add_argument('--prefix', help='new model prefix', default=default.e2e_prefix, type=str)
-    parser.add_argument('--begin_epoch', help='begin epoch of training, use with resume', default=0, type=int)
-    parser.add_argument('--end_epoch', help='end epoch of training', default=default.e2e_epoch, type=int)
-    parser.add_argument('--lr', help='base learning rate', default=default.e2e_lr, type=float)
-    parser.add_argument('--lr_step', help='learning rate steps (in epoch)', default=default.e2e_lr_step, type=str)
-    args = parser.parse_args()
-    return args
-
-
-def main():
-    args = parse_args()
-    logger.info('Called with argument: %s' % args)
-    ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')]
-    train_net(args, ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch,
-              lr=args.lr, lr_step=args.lr_step)
-
-if __name__ == '__main__':
-    main()