Dev lgbm (#147) (#152)
* initial restructure

* thresholds on unet output

* added gmean tta, experimented with thresholding (#125)

* feature extractor and lightgbm

* pipeline is running ok

* tmp commit

* lgbm ready for tests

* tmp

* faster nms and feature extraction

* small fix

* cleaning

* Dev repo cleanup (#138)

* initial restructure

* clean structure (#126)

* clean structure

* correct readme

* further cleaning

* Dev apply transformer (#131)

* clean structure

* correct readme

* further cleaning

* resizer docstring

* couple docstrings

* make apply transformer, memory cache

* fixes

* postprocessing docstrings

* fixes in PR

* Dev repo cleanup (#132)

* cleanup

* remove src.

* Dev clean tta (#134)

* added resize padding, refactored inference pipelines

* refactored pipelines

* added color shift augmentation

* reduced caching to just mask_resize

* updated config

* Dev-repo_cleanup models and losses docstrings (#135)

* models and losses docstrings

* small fixes in docstrings

* resolve conflicts in with TTA PR (#137)

* refactor in stream mode (#139)

* hot fix of mask_postprocessing in tta with new make transformer

* finishing merge

* finishing merge v2

* finishing merge v3

* finishing merge v4

* tmp commit

* lgbm train and evaluate pipelines run correctly

* something is not right yet

* fix

* working lgbm training with ugly train_mode=True

* back to pipelines.py

* small fix

* preparing PR

* preparing PR v2

* preparing PR v2

* fix

* fix_2

* fix_3

* fix_4
jakubczakon committed Jun 21, 2018
1 parent 8e269de commit 20e64ed
Showing 12 changed files with 485 additions and 51 deletions.
22 changes: 17 additions & 5 deletions neptune.yaml
@@ -1,7 +1,7 @@
project: YOUR_PROJECT_NAME

name: mapping_challenge_open_solution
tags: [solution_1]
tags: [solution_5]

metric:
channel: 'Final Validation Score'
@@ -41,7 +41,6 @@ parameters:
loader_mode: resize
stream_mode: 0


# General parameters
image_h: 256
image_w: 256
@@ -86,11 +85,10 @@ parameters:

# Postprocessing
threshold: 0.5
min_nuclei_size: 20
erosion_percentages: '[10,20,30]'
erode_selem_size: 0
dilate_selem_size: 2
tta_aggregation_method: gmean
nms__iou_threshold: 0.5

# Inference padding
crop_image_h: 300
@@ -100,4 +98,18 @@ parameters:
pad_method: 'replicate'

#Neptune monitor
unet_outputs_to_plot: '["multichannel_map",]'
unet_outputs_to_plot: '["multichannel_map",]'

#Scoring model
scoring_model: 'lgbm'
scoring_model__num_training_examples: 10000

#LightGBM
lgbm__learning_rate: 0.001
lgbm__num_leaves: 10
lgbm__min_data: 50
lgbm__max_depth: 10
lgbm__number_of_trees: 100
lgbm__early_stopping: 5
lgbm__train_size: 0.7
lgbm__target: 'iou'
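
For orientation, the new lgbm__* entries above are consumed later in src/pipeline_config.py (see that hunk below), where they are assembled into a LightGBM parameter dictionary. The following is a minimal sketch of that mapping with toy data in place of the extracted mask features, assuming a LightGBM release whose train() accepts early_stopping_rounds directly (as was typical in 2018):

# Sketch only: how the lgbm__* values above map onto a LightGBM training call,
# using toy random data instead of the real per-instance mask features.
import numpy as np
import lightgbm as lgb

model_params = {'learning_rate': 0.001,    # lgbm__learning_rate
                'boosting_type': 'gbdt',
                'objective': 'regression',
                'metric': 'regression_l2',
                'num_leaves': 10,          # lgbm__num_leaves
                'min_data': 50,            # lgbm__min_data
                'max_depth': 10}           # lgbm__max_depth

rng = np.random.RandomState(1234)
X = rng.rand(1000, 8)   # one row of features per predicted instance mask
y = rng.rand(1000)      # regression target, e.g. IoU with the matched ground truth

train_set = lgb.Dataset(X[:700], label=y[:700])   # lgbm__train_size: 0.7
valid_set = lgb.Dataset(X[700:], label=y[700:])
booster = lgb.train(model_params,
                    train_set,
                    num_boost_round=100,          # lgbm__number_of_trees
                    valid_sets=[valid_set],
                    early_stopping_rounds=5)      # lgbm__early_stopping
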
4 changes: 2 additions & 2 deletions src/callbacks.py
@@ -13,7 +13,7 @@
from .steps.utils import get_logger
from .steps.pytorch.callbacks import NeptuneMonitor, ValidationMonitor
from .utils import softmax, coco_evaluation, create_annotations, make_apply_transformer
from .pipeline_config import CATEGORY_IDS, Y_COLUMNS_SCORING
from .pipeline_config import CATEGORY_IDS, Y_COLUMNS_SCORING, CATEGORY_LAYERS

logger = get_logger()

@@ -200,7 +200,7 @@ def _generate_prediction(self, cache_dirpath, outputs):
output = pipeline.transform(data)
y_pred = output['y_pred']

prediction = create_annotations(self.meta_valid, y_pred, logger, CATEGORY_IDS)
prediction = create_annotations(self.meta_valid, y_pred, logger, CATEGORY_IDS, CATEGORY_LAYERS)
return prediction


6 changes: 3 additions & 3 deletions src/loaders.py
@@ -436,9 +436,9 @@ def _get_tta_data(self, i, row):


class TestTimeAugmentationAggregator(BaseTransformer):
def __init__(self, method, nthreads):
def __init__(self, method, num_threads):
self.method = method
self.nthreads = nthreads
self.num_threads = num_threads

@property
def agg_method(self):
@@ -456,7 +456,7 @@ def transform(self, images, tta_params, img_ids, **kwargs):
img_ids=img_ids,
agg_method=self.agg_method)
unique_img_ids = set(img_ids)
threads = min(self.nthreads, len(unique_img_ids))
threads = min(self.num_threads, len(unique_img_ids))
with mp.pool.ThreadPool(threads) as executor:
averages_images = executor.map(_aggregate_augmentations, unique_img_ids)
return {'aggregated_prediction': averages_images}
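
For intuition about what the renamed aggregator computes: predictions from all augmentation runs of the same image are grouped by img_id and reduced with the configured method (gmean here, per tta_aggregation_method). A rough sketch follows, assuming probability maps as input; the repository's aggregator also receives tta_params, presumably to undo each augmentation before averaging, which is omitted below:

# Sketch only: geometric-mean aggregation of per-augmentation probability maps,
# grouped by image id, standing in for the configured agg_method.
import numpy as np
from scipy.stats import gmean

def aggregate_tta(images, img_ids, method='gmean'):
    """images: list of HxW probability maps, one per augmentation run."""
    aggregated = {}
    for img_id in set(img_ids):
        stack = np.stack([img for img, i in zip(images, img_ids) if i == img_id])
        aggregated[img_id] = gmean(stack, axis=0) if method == 'gmean' else stack.mean(axis=0)
    return aggregated

# Three augmented predictions for one image, two for another.
predictions = [np.random.rand(4, 4) for _ in range(5)]
image_ids = ['img_0', 'img_0', 'img_0', 'img_1', 'img_1']
aggregated = aggregate_tta(predictions, image_ids)
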
97 changes: 94 additions & 3 deletions src/models.py
@@ -4,13 +4,18 @@
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestRegressor

from .callbacks import NeptuneMonitorSegmentation, ValidationMonitorSegmentation
from .steps.pytorch.architectures.unet import UNet
from .steps.pytorch.callbacks import CallbackList, TrainingMonitor, ModelCheckpoint, \
ExperimentTiming, ExponentialLRScheduler, EarlyStopping
from .steps.pytorch.models import Model
from .steps.pytorch.validation import multiclass_segmentation_loss, DiceLoss
from .steps.sklearn.models import LightGBM, make_transformer, SklearnRegressor
from .utils import softmax
from .unet_models import AlbuNet, UNet11, UNetVGG16, UNetResNet

@@ -159,9 +164,12 @@ def __init__(self, architecture_config, training_config, callbacks_config):
class PyTorchUNetWeightedStream(BasePyTorchUNet):
def __init__(self, architecture_config, training_config, callbacks_config):
super().__init__(architecture_config, training_config, callbacks_config)
weighted_loss = partial(multiclass_weighted_cross_entropy,
**get_loss_variables(**architecture_config['weighted_cross_entropy']))
loss = partial(mixed_dice_cross_entropy_loss, dice_weight=architecture_config['loss_weights']['dice_mask'],
weights_function = partial(get_weights, **architecture_config['weighted_cross_entropy'])
weighted_loss = partial(multiclass_weighted_cross_entropy, weights_function=weights_function)
dice_loss = partial(multiclass_dice_loss, excluded_classes=[0])
loss = partial(mixed_dice_cross_entropy_loss,
dice_loss=dice_loss,
dice_weight=architecture_config['loss_weights']['dice_mask'],
cross_entropy_weight=architecture_config['loss_weights']['bce_mask'],
cross_entropy_loss=weighted_loss,
**architecture_config['dice'])
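
The loss wired up above combines a per-pixel weighted cross-entropy with a multiclass dice term that excludes the background class. The repository's mixed_dice_cross_entropy_loss is defined elsewhere in this file; the sketch below only illustrates the general shape of such a mixed loss (a weighted sum), using plain unweighted cross-entropy and hypothetical helper names:

# Sketch only: a weighted sum of cross-entropy and soft dice, roughly the shape of
# the mixed loss configured above. Helper names are hypothetical; the real
# cross-entropy term is additionally weighted per pixel via weights_function.
import torch
import torch.nn.functional as F

def soft_dice_loss(probs, target_onehot, smooth=1.0):
    # Per-class soft dice averaged over classes; class 0 (background) excluded,
    # mirroring excluded_classes=[0] above.
    probs, target_onehot = probs[:, 1:], target_onehot[:, 1:]
    dims = (0, 2, 3)
    intersection = (probs * target_onehot).sum(dims)
    cardinality = probs.sum(dims) + target_onehot.sum(dims)
    return 1.0 - ((2.0 * intersection + smooth) / (cardinality + smooth)).mean()

def mixed_loss(logits, target, dice_weight=0.5, cross_entropy_weight=0.5):
    ce = F.cross_entropy(logits, target)          # target: (N, H, W) class indices
    probs = F.softmax(logits, dim=1)
    onehot = F.one_hot(target, num_classes=logits.size(1)).permute(0, 3, 1, 2).float()
    return cross_entropy_weight * ce + dice_weight * soft_dice_loss(probs, onehot)

logits = torch.randn(2, 3, 8, 8, requires_grad=True)
target = torch.randint(0, 3, (2, 8, 8))
mixed_loss(logits, target).backward()
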
@@ -201,6 +209,81 @@ def _transform(self, datagen, validation_datagen=None):
self.model.train()


class ScoringLightGBM(LightGBM):
def __init__(self, model_params, training_params, train_size, target):
self.train_size = train_size
self.target = target
self.feature_names = []
self.estimator = None
super().__init__(model_params, training_params)

def fit(self, features, **kwargs):
df_features = _convert_features_to_df(features)
train_data, val_data = train_test_split(df_features, train_size=self.train_size)
self.feature_names = list(df_features.columns.drop(self.target))
super().fit(X=train_data[self.feature_names],
y=train_data[self.target],
X_valid=val_data[self.feature_names],
y_valid=val_data[self.target],
feature_names=self.feature_names,
categorical_features=[])
return self

def transform(self, features, **kwargs):
scores = []
for image_features in features:
image_scores = []
for layer_features in image_features:
if len(layer_features) > 0:
layer_scores = super().transform(layer_features[self.feature_names])
image_scores.append(list(layer_scores['prediction']))
else:
image_scores.append([])
scores.append(image_scores)
return {'scores': scores}

def save(self, filepath):
joblib.dump((self.estimator, self.feature_names), filepath)

def load(self, filepath):
self.estimator, self.feature_names = joblib.load(filepath)


class ScoringRandomForest(SklearnRegressor):
def __init__(self, train_size, target, **kwargs):
self.train_size = train_size
self.target = target
self.feature_names = []
self.estimator = RandomForestRegressor()

def fit(self, features, **kwargs):
df_features = _convert_features_to_df(features)
train_data, val_data = train_test_split(df_features, train_size=self.train_size)
self.feature_names = list(df_features.columns.drop(self.target))
super().fit(X=train_data[self.feature_names],
y=train_data[self.target])
return self

def transform(self, features, **kwargs):
scores = []
for image_features in features:
image_scores = []
for layer_features in image_features:
if len(layer_features) > 0:
layer_scores = super().transform(layer_features[self.feature_names])
image_scores.append(list(layer_scores['prediction']))
else:
image_scores.append([])
scores.append(image_scores)
return {'scores': scores}

def save(self, filepath):
joblib.dump((self.estimator, self.feature_names), filepath)

def load(self, filepath):
self.estimator, self.feature_names = joblib.load(filepath)


def weight_regularization_unet(model, regularize, weight_decay_conv2d):
if regularize:
parameter_list = [{'params': model.parameters(), 'weight_decay': weight_decay_conv2d}]
@@ -369,3 +452,11 @@ def multiclass_dice_loss(output, target, smooth=0, activation='softmax', exclude
class_target.data = class_target.data.float()
loss += dice(output[:, class_nr, :, :], class_target)
return loss


def _convert_features_to_df(features):
df_features = []
for image_features in features:
for layer_features in image_features[1:]:
df_features.append(layer_features)
return pd.concat(df_features)
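
To make the scoring models' expected input concrete: features appears to be a list with one entry per image, each entry a list of per-layer pandas DataFrames whose first element (presumably the background layer) is skipped by _convert_features_to_df. Below is a toy sketch of fitting and applying ScoringLightGBM under that assumption; the column names, single foreground layer, and hyperparameter values are illustrative, and the import assumes the repository root is on PYTHONPATH:

# Sketch only: toy nested `features` structure for ScoringLightGBM. Feature columns
# ('area', 'solidity') are made up; the target column matches lgbm__target: 'iou'.
import numpy as np
import pandas as pd
from src.models import ScoringLightGBM   # assumes the repository root is importable

rng = np.random.RandomState(1234)

def fake_layer(n_instances):
    return pd.DataFrame({'area': rng.rand(n_instances),
                         'solidity': rng.rand(n_instances),
                         'iou': rng.rand(n_instances)})

# One entry per image: an empty background layer plus one foreground layer.
features = [[pd.DataFrame(), fake_layer(12)] for _ in range(200)]

scorer = ScoringLightGBM(model_params={'learning_rate': 0.001,
                                       'boosting_type': 'gbdt',
                                       'objective': 'regression',
                                       'metric': 'regression_l2',
                                       'num_leaves': 10,
                                       'min_data': 50,
                                       'max_depth': 10},
                         training_params={'number_boosting_rounds': 100,
                                          'early_stopping_rounds': 5},
                         train_size=0.7,
                         target='iou')
scorer.fit(features)
scores = scorer.transform(features)['scores']   # nested per-image, per-layer score lists
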
24 changes: 21 additions & 3 deletions src/pipeline_config.py
@@ -12,8 +12,9 @@
X_COLUMNS = ['file_path_image']
Y_COLUMNS = ['file_path_mask_eroded_0_dilated_0']
Y_COLUMNS_SCORING = ['ImageId']
CATEGORY_IDS = [None, 100]
SEED = 1234
CATEGORY_IDS = [None, 100]
CATEGORY_LAYERS = [1, 19]
MEAN = [0.485, 0.456, 0.406]
STD = [0.229, 0.224, 0.225]

@@ -121,15 +122,32 @@
'rotation': True,
'color_shift_runs': False},
'tta_aggregator': {'method': params.tta_aggregation_method,
'nthreads': params.num_threads
'num_threads': params.num_threads
},
'dropper': {'min_size': params.min_nuclei_size},
'postprocessor': {'mask_dilation': {'dilate_selem_size': params.dilate_selem_size
},
'mask_erosion': {'erode_selem_size': params.erode_selem_size
},
'prediction_crop': {'h_crop': params.crop_image_h,
'w_crop': params.crop_image_w
},
'scoring_model': params.scoring_model,
'lightGBM': {'model_params': {'learning_rate': params.lgbm__learning_rate,
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': 'regression_l2',
'sub_feature': 1.0,
'num_leaves': params.lgbm__num_leaves,
'min_data': params.lgbm__min_data,
'max_depth': params.lgbm__max_depth},
'training_params': {'number_boosting_rounds': params.lgbm__number_of_trees,
'early_stopping_rounds': params.lgbm__early_stopping},
'train_size': params.lgbm__train_size,
'target': params.lgbm__target
},
'random_forest': {'train_size': params.lgbm__train_size,
'target': params.lgbm__target},
'nms': {'iou_threshold': params.nms__iou_threshold,
'num_threads': params.num_threads},
}
})
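
Note that the lightGBM and random_forest sub-dictionaries above mirror the constructor signatures of ScoringLightGBM and ScoringRandomForest, so the pipeline can presumably build the scorer directly from this config. A sketch of that wiring; build_scoring_model is a hypothetical helper and SOLUTION_CONFIG stands in for the config object assembled in this module:

# Sketch only: building the scoring step from the postprocessor config above.
from src.models import ScoringLightGBM, ScoringRandomForest

def build_scoring_model(config):
    postprocessor = config['postprocessor']
    if postprocessor['scoring_model'] == 'lgbm':
        return ScoringLightGBM(**postprocessor['lightGBM'])
    return ScoringRandomForest(**postprocessor['random_forest'])

# scorer = build_scoring_model(SOLUTION_CONFIG)
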