6 changes: 3 additions & 3 deletions README.md
@@ -1,4 +1,4 @@
<h1 align="center">Welcome to PathFlowAI 👋</h1>
<h1 align="center">Welcome to PathFlowAI </h1>
<p>
<img alt="Version" src="https://img.shields.io/badge/version-0.1-blue.svg?cacheSeconds=2592000" />
<a href="https://jlevy44.github.io/PathFlowAI/">
@@ -10,11 +10,11 @@

### 🏠 [Homepage](https://github.com/jlevy44/PathFlowAI)

MedRxiv Manuscript: https://www.medrxiv.org/content/10.1101/19003897v1
Published in the Proceedings of the Pacific Symposium for Biocomputing 2020, Manuscript: https://psb.stanford.edu/psb-online/proceedings/psb20/Levy.pdf

## Install

First, install [openslide](https://openslide.org/download/).
First, install [openslide](https://openslide.org/download/). Note: you may need to install libiconv and shapely using conda. More installation information will be added; in the meantime, please submit issues.

```sh
pip install pathflowai
```
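
If openslide, libiconv, or shapely are troublesome to build, one possible route is conda (a sketch; availability of these packages on conda-forge for your platform is an assumption):

```sh
# assumption: these packages are published on conda-forge
conda install -c conda-forge openslide libiconv shapely
```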
2 changes: 2 additions & 0 deletions bin/install_apex
@@ -1,5 +1,7 @@
#!/bin/bash

export TMPDIR=$HOME/tmp
mkdir -p $TMPDIR
rm -rf apex
git clone https://github.com/NVIDIA/apex
cd apex
11 changes: 11 additions & 0 deletions bin/install_lightnet
@@ -0,0 +1,11 @@
#!/bin/bash
#python -m lightnet download tiny-yolo
#python -m lightnet download yolo
rm -rf lightnet
git clone https://gitlab.com/EAVISE/lightnet.git
cd lightnet
pip install .
cd ..
rm -rf lightnet
#wget https://pjreddie.com/media/files/yolo.weights
#wget https://pjreddie.com/media/files/tiny-yolo.weights
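
Since the script deletes the cloned source tree after installing, a quick import check confirms the package itself survived (a sketch; assumes the `pip install .` above succeeded):

```sh
python -c "import lightnet; print(lightnet.__file__)"
```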
99 changes: 99 additions & 0 deletions experimental/datasets.py
@@ -0,0 +1,99 @@
#
# Lightnet dataset that works with brambox annotations
# Copyright EAVISE
#
# https://eavise.gitlab.io/lightnet/_modules/lightnet/models/_dataset_brambox.html#BramboxDataset
# https://eavise.gitlab.io/brambox/notes/02-getting_started.html#Loading-data

import os
import copy
import logging
from PIL import Image
import numpy as np
import lightnet.data as lnd
from pathflowai.utils import load_sql_df
import dask.array as da
from os.path import join

try:
    import brambox as bb
except ImportError:
    bb = None

__all__ = ['BramboxPathFlowDataset']
log = logging.getLogger(__name__)

# ADD IMAGE ANNOTATION TRANSFORM
# ADD TRAIN VAL TEST INFO

class BramboxPathFlowDataset(lnd.Dataset):
    """ Dataset for any brambox annotations.

    Args:
        annotations (dataframe): Dataframe containing brambox annotations
        input_dimension (tuple): (width,height) tuple with default dimensions of the network
        class_label_map (list): List of class_labels
        identify (function, optional): Lambda/function to get image based on annotation filename or image id; Default **replace/add .png extension to filename/id**
        img_transform (torchvision.transforms.Compose): Transforms to perform on the images
        anno_transform (torchvision.transforms.Compose): Transforms to perform on the annotations

    Note:
        This dataset opens images with the Pillow library
    """
    def __init__(self, input_dir, patch_info_file, patch_size, annotations, input_dimension, class_label_map=None, identify=None, img_transform=None, anno_transform=None):
        if bb is None:
            raise ImportError('Brambox needs to be installed to use this dataset')
        super().__init__(input_dimension)

        self.annos = annotations
        self.annos['ignore'] = 0
        self.annos['class_label'] = self.annos['class_label'].astype(int)
        print(self.annos['class_label'].unique())
        self.keys = self.annos.image.cat.categories  # stores unique patches
        self.img_tf = img_transform
        self.anno_tf = anno_transform
        self.patch_info = load_sql_df(patch_info_file, patch_size)
        IDs = self.patch_info['ID'].unique()
        # one lazily-sliced dask array per whole-slide zarr store
        self.slides = {slide: da.from_zarr(join(input_dir, '{}.zarr'.format(slide))) for slide in IDs}
        self.id = lambda k: k.split('/')  # image keys are 'ID/x/y/patch_size'
        # experiment: swap width/height to match mask orientation
        self.annos['width'], self.annos['height'] = self.annos['height'], self.annos['width']
        # Add class_ids
        if class_label_map is None:
            log.warning('No class_label_map given, generating it by sorting unique class labels from the data alphabetically; this is not always deterministic behaviour')
            class_label_map = list(np.sort(self.annos.class_label.unique()))
        self.annos['class_id'] = self.annos.class_label.map(dict((l, i) for i, l in enumerate(class_label_map)))

    def __len__(self):
        return len(self.keys)

    @lnd.Dataset.resize_getitem
    def __getitem__(self, index):
        """ Get transformed image and annotations based on the index of ``self.keys``

        Args:
            index (int): index of the ``self.keys`` list containing all the image identifiers of the dataset.

        Returns:
            tuple: (transformed image, list of transformed brambox boxes)
        """
        if index >= len(self):
            raise IndexError(f'list index out of range [{index}/{len(self)-1}]')

        # Load the patch from the slide's zarr array and its matching boxes
        ID, x, y, patch_size = self.id(self.keys[index])
        x, y, patch_size = int(x), int(y), int(patch_size)
        img = self.slides[ID][x:x+patch_size, y:y+patch_size].compute()
        anno = bb.util.select_images(self.annos, [self.keys[index]])

        # Transform
        if self.img_tf is not None:
            img = self.img_tf(img)
        if self.anno_tf is not None:
            anno = self.anno_tf(anno)

        return img, anno
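
A minimal usage sketch for the dataset above. The file names are the argparse defaults used elsewhere in this PR; the annotation pickle and the per-slide `.zarr` arrays are assumed to already exist, and `datasets` refers to this module:

```python
import brambox as bb
from datasets import BramboxPathFlowDataset

# brambox-format dataframe, e.g. as written by get_bounding_boxes_from_seg_point_masks.py
annotations = bb.io.load('pandas', 'annotations_bbox_512.pkl')

dataset = BramboxPathFlowDataset(input_dir='inputs', patch_info_file='cell_info.db',
                                 patch_size=512, annotations=annotations,
                                 input_dimension=(512, 512))
img, anno = dataset[0]  # patch pixels (numpy array) and its brambox boxes
print(len(dataset), img.shape)
```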
23 changes: 23 additions & 0 deletions experimental/get_anchors.py
@@ -0,0 +1,23 @@
from sklearn.cluster import KMeans
import numpy as np, pandas as pd, brambox as bb
import pickle, argparse

p=argparse.ArgumentParser()
p.add_argument('--patch_size',default=512,type=int)
p.add_argument('--n_anchors',default=20,type=int)
p.add_argument('--sample_p',default=1.,type=float)

args=p.parse_args()
np.random.seed(42)
patch_size=args.patch_size
n_anchors=args.n_anchors
sample_p=args.sample_p
annotation_file = 'annotations_bbox_{}.pkl'.format(patch_size)
annotations=bb.io.load('pandas',annotation_file)
if sample_p<1.:
    annotations=annotations.sample(frac=sample_p)

X=annotations[['x_top_left','y_top_left']].astype(float).values+(annotations['width']/2.).astype(float).values.reshape(-1,1)
km=KMeans(n_clusters=n_anchors,n_jobs=-1).fit(X)
anchors=km.cluster_centers_
pickle.dump(anchors,open('anchors.pkl','wb'))
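
The features clustered here are box centers: `width/2` is added to both coordinates, consistent with the square boxes this pipeline emits. A quick look at the saved output (a sketch, assuming the script was run with its defaults):

```python
import pickle

with open('anchors.pkl', 'rb') as f:
    anchors = pickle.load(f)
print(anchors.shape)  # (20, 2): one (x, y) cluster center per anchor
```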
137 changes: 137 additions & 0 deletions experimental/get_bounding_boxes_from_seg_point_masks.py
@@ -0,0 +1,137 @@
import brambox as bb
import os
from os.path import join, basename
from pathflowai.utils import load_sql_df, npy2da
import skimage
import dask, dask.array as da, pandas as pd, numpy as np
import argparse
from scipy import ndimage
from scipy.ndimage.measurements import label
import pickle
from dask.distributed import Client
from multiprocessing import Pool
from functools import reduce

def get_box(l,prop):
    # centroid in (x, y) order; regionprops returns (row, col)
    c=[prop.centroid[1], prop.centroid[0]]
    width = prop.bbox[3] - prop.bbox[1] + 1
    height = prop.bbox[2] - prop.bbox[0] + 1
    wh=max(width,height)  # side of the square box
    return [l]+c+[wh]

def get_boxes(m,ID='test',x='x',y='y',patch_size='patchsize', num_classes=3):
    lbls,n_lbl=label(m)
    # map each connected-component label to its class in the mask
    obj_labels={}
    for i in range(1,num_classes+1):
        obj_labels[i]=np.unique(lbls[m==i].flatten())
    rev_label={}
    for k in obj_labels:
        for i in obj_labels[k]:
            rev_label[i]=k
    rev_label={k:rev_label[k] for k in sorted(list(rev_label.keys()))}
    objProps = list(skimage.measure.regionprops(lbls))
    boxes=dask.compute(*[dask.delayed(get_box)(rev_label[i],objProps[i-1]) for i in list(rev_label.keys())],scheduler='threading')
    boxes=pd.DataFrame(np.array(boxes).astype(int),columns=['class_label','x_top_left','y_top_left','width'])
    boxes['height']=boxes['width']
    boxes['image']='{}/{}/{}/{}'.format(ID,x,y,patch_size)
    boxes=boxes[['image','class_label','x_top_left','y_top_left','width','height']]
    boxes.loc[:,'x_top_left']=np.clip(boxes.loc[:,'x_top_left'],0,m.shape[1])
    boxes.loc[:,'y_top_left']=np.clip(boxes.loc[:,'y_top_left'],0,m.shape[0])
    return boxes

if __name__=='__main__':
    p=argparse.ArgumentParser()
    p.add_argument('--num_classes',default=4,type=int)
    p.add_argument('--patch_size',default=512,type=int)
    p.add_argument('--n_workers',default=40,type=int)
    p.add_argument('--p_sample',default=0.7,type=float)
    p.add_argument('--input_dir',default='inputs',type=str)
    p.add_argument('--patch_info_file',default='cell_info.db',type=str)
    p.add_argument('--reference_mask',default='reference_mask.npy',type=str)
    # add mode to just use own extracted bounding boxes or from seg, maybe from histomicstk

    args=p.parse_args()
    num_classes=args.num_classes
    n_workers=args.n_workers
    input_dir=args.input_dir
    patch_info_file=args.patch_info_file
    patch_size=args.patch_size
    p_sample=args.p_sample
    np.random.seed(42)
    annotation_file = 'annotations_bbox_{}.pkl'.format(patch_size)
    reference_mask=args.reference_mask

    # derive a fixed per-class box width (mean + 2 std) from a reference mask
    if not os.path.exists('widths.pkl'):
        m=np.load(reference_mask)
        bbox_df=get_boxes(m)
        official_widths=dict(bbox_df.groupby('class_label')['width'].mean()+2*bbox_df.groupby('class_label')['width'].std())
        pickle.dump(official_widths,open('widths.pkl','wb'))
    else:
        official_widths=pickle.load(open('widths.pkl','rb'))

    patch_info=load_sql_df(patch_info_file, patch_size)
    IDs=patch_info['ID'].unique()
    masks = {mask:npy2da(join(input_dir,'{}_mask.npy'.format(mask))) for mask in IDs}

    if p_sample < 1.:
        patch_info=patch_info.sample(frac=p_sample)

    if not os.path.exists(annotation_file):
        bbox_df=bb.util.new('annotation').drop(columns=['difficult','ignore','lost','occluded','truncated'])[['image','class_label','x_top_left','y_top_left','width','height']]
    else:
        bbox_df=bb.io.load('pandas',annotation_file)

    # skip patches whose boxes were already extracted on a previous run
    patch_info=patch_info[~np.isin(np.vectorize(lambda i: '/'.join(patch_info.iloc[i][['ID','x','y','patch_size']].astype(str).tolist()))(np.arange(patch_info.shape[0])),set(bbox_df.image.cat.categories))]

    print(patch_info.shape[0])

    def get_boxes_point_seg(m,ID,x,y,patch_size2,num_classes):
        bbox_dff=get_boxes(m,ID=ID,x=x,y=y,patch_size=patch_size2, num_classes=num_classes)
        # replace per-object widths with the fixed per-class width, then recenter
        for i in official_widths.keys():
            bbox_dff.loc[bbox_dff['class_label']==i,'width']=int(official_widths[i])
        bbox_dff.loc[:,'x_top_left']=(bbox_dff.loc[:,'x_top_left']-bbox_dff['width']/2.).astype(int)
        bbox_dff.loc[:,'y_top_left']=(bbox_dff.loc[:,'y_top_left']-bbox_dff['width']/2.).astype(int)
        bbox_dff.loc[:,'x_top_left']=np.clip(bbox_dff.loc[:,'x_top_left'],0,m.shape[1])
        bbox_dff.loc[:,'y_top_left']=np.clip(bbox_dff.loc[:,'y_top_left'],0,m.shape[0])
        return bbox_dff

    def process_chunk(patch_info_sub):
        patch_info_sub=patch_info_sub.reset_index(drop=True)
        bbox_dfs=[]
        for i in range(patch_info_sub.shape[0]):
            patch=patch_info_sub.iloc[i]
            ID,x,y,patch_size2=patch[['ID','x','y','patch_size']].tolist()
            m=masks[ID][x:x+patch_size2,y:y+patch_size2]
            bbox_dfs.append(get_boxes_point_seg(m,ID,x,y,patch_size2,num_classes))
        return bbox_dfs

    patch_info_subs=np.array_split(patch_info,n_workers)
    p=Pool(n_workers)
    bbox_dfs=reduce(lambda x,y:x+y,p.map(process_chunk,patch_info_subs))

    bbox_df=pd.concat([bbox_df]+bbox_dfs)
    bbox_df.loc[:,'height']=bbox_df['width']
    bb.io.save(bbox_df,'pandas',annotation_file)
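
A toy sanity check for `get_boxes` on a synthetic mask (a sketch; it assumes the definitions above are importable from this file):

```python
import numpy as np
from get_bounding_boxes_from_seg_point_masks import get_boxes

m = np.zeros((64, 64), dtype=int)
m[10:20, 10:18] = 1   # one class-1 object
m[40:50, 30:42] = 2   # one class-2 object
boxes = get_boxes(m, ID='toy', x=0, y=0, patch_size=64, num_classes=3)
print(boxes)  # two square boxes, one per labeled object, keyed 'toy/0/0/64'
```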
73 changes: 73 additions & 0 deletions experimental/get_counts.py
@@ -0,0 +1,73 @@
import brambox as bb
import os
from os.path import join, basename
from pathflowai.utils import load_sql_df, npy2da, df2sql
import skimage
import dask, dask.array as da, pandas as pd, numpy as np
import argparse
from scipy import ndimage
from scipy.ndimage.measurements import label
import pickle
from dask.distributed import Client
from multiprocessing import Pool
from functools import reduce

def count_cells(m, num_classes=3):
    lbls,n_lbl=label(m)
    obj_labels=np.zeros(num_classes)
    for i in range(1,num_classes+1):
        # number of connected components belonging to class i
        obj_labels[i-1]=len(np.unique(lbls[m==i].flatten()))
    return obj_labels

if __name__=='__main__':
    p=argparse.ArgumentParser()
    p.add_argument('--num_classes',default=4,type=int)
    p.add_argument('--patch_size',default=512,type=int)
    p.add_argument('--n_workers',default=40,type=int)
    p.add_argument('--p_sample',default=0.7,type=float)
    p.add_argument('--input_dir',default='inputs',type=str)
    p.add_argument('--patch_info_file',default='cell_info.db',type=str)
    p.add_argument('--reference_mask',default='reference_mask.npy',type=str)

    args=p.parse_args()
    num_classes=args.num_classes
    n_workers=args.n_workers
    input_dir=args.input_dir
    patch_info_file=args.patch_info_file
    patch_size=args.patch_size
    np.random.seed(42)
    reference_mask=args.reference_mask

    patch_info=load_sql_df(patch_info_file, patch_size)
    IDs=patch_info['ID'].unique()
    masks = {mask:npy2da(join(input_dir,'{}_mask.npy'.format(mask))) for mask in IDs}

    def process_chunk(patch_info_sub):
        patch_info_sub=patch_info_sub.reset_index(drop=True)
        counts=[]
        for i in range(patch_info_sub.shape[0]):
            patch=patch_info_sub.iloc[i]
            ID,x,y,patch_size2=patch[['ID','x','y','patch_size']].tolist()
            m=masks[ID][x:x+patch_size2,y:y+patch_size2]
            counts.append(dask.delayed(count_cells)(m, num_classes=num_classes))
        return dask.compute(*counts,scheduler='threading')

    patch_info_subs=np.array_split(patch_info,n_workers)
    p=Pool(n_workers)
    counts=reduce(lambda x,y:x+y,p.map(process_chunk,patch_info_subs))

    # one row of per-class counts per patch, appended to the patch metadata
    counts=pd.DataFrame(np.vstack(counts))
    patch_info=pd.concat([patch_info[['ID','x','y','patch_size','annotation']].reset_index(drop=True),counts.reset_index(drop=True)],axis=1).reset_index()
    print(patch_info)

    df2sql(patch_info, 'counts_test.db', patch_size, mode='replace')
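
A toy check of `count_cells` (a sketch; assumes the function above is importable from this file):

```python
import numpy as np
from get_counts import count_cells

m = np.zeros((32, 32), dtype=int)
m[2:6, 2:6] = 1       # first class-1 cell
m[10:14, 10:14] = 1   # second class-1 cell
m[20:25, 20:25] = 2   # one class-2 cell
print(count_cells(m, num_classes=3))  # [2. 1. 0.]
```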