diff --git a/data/sql/estimation_procedure.sql b/data/sql/estimation_procedure.sql index 200878d41..7f69e908b 100644 --- a/data/sql/estimation_procedure.sql +++ b/data/sql/estimation_procedure.sql @@ -26,4 +26,5 @@ INSERT INTO `estimation_procedure` (`id`, `ttid`, `name`, `type`, `repeats`, `fo (25, 1, '4-fold Crossvalidation', 'crossvalidation', 1, 4, 'false', NULL, 'true', 'false', '2016-03-15 13:32:10'), (26, 1, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'), (27, 2, 'Test on Training Data', 'testontrainingdata', NULL, NULL, 'false', NULL, NULL, 'false', '2019-03-16 11:30:14'), -(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53'); +(28, 1, '20% Holdout (Ordered)', 'holdout_ordered', 1, 1, 'false', 20, NULL, 'false', '2019-05-23 12:40:53'), +(29, 9, '10-fold Crossvalidation', 'crossvalidation', 1, 10, 'false', NULL, 'true', 'false', '2014-12-31 20:00:00'); diff --git a/data/sql/math_function.sql b/data/sql/math_function.sql index 9287b920a..3bb582e98 100644 --- a/data/sql/math_function.sql +++ b/data/sql/math_function.sql @@ -1,75 +1,75 @@ INSERT INTO `math_function` (`id`, `name`, `functionType`, `min`, `max`, `unit`, `higherIsBetter`, `description`, `source_code`, `date`) VALUES -(1, 'EuclideanDistance', 'Metric', '0', '', '', NULL, NULL, '', '2014-12-31 21:00:00'), -(2, 'PolynomialKernel', 'KernelFunction', '', '', '', NULL, NULL, '', '2014-12-31 21:00:00'), -(3, 'RBFKernel', 'KernelFunction', '', '', '', NULL, NULL, '', '2014-12-31 21:00:00'), -(4, 'area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', 'The area under the ROC curve (AUROC), calculated using the Mann-Whitney U-test.\r\n\r\nThe curve is constructed by shifting the threshold for a positive prediction from 0 to 1, yielding a series of true positive rates (TPR) and false positive rates (FPR), from which a step-wise ROC curve can be constructed.\r\n\r\nSee 
http://en.wikipedia.org/wiki/Receiver_operating_characteristic\r\n\r\nNote that this is different from the Area Under the ROC Convex Hull (ROC AUCH).\r\n\r\nAUROC is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_area_under_roc_curve for the weighted average over all class values.', 'See WEKA\'s ThresholdCurve class.', '2014-12-31 21:00:00'), -(5, 'average_cost', 'EvaluationFunction', '-Inf', 'Inf', '', '0', NULL, '', '2014-12-31 21:00:00'), -(6, 'binominal_test', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(7, 'build_cpu_time', 'EvaluationFunction', '0', 'Inf', 'seconds', '0', 'The time in seconds to build a single model on all data.', '', '2014-12-31 21:00:00'), -(8, 'build_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'The memory, in bytes, needed to build a single model on all data.', '', '2014-12-31 21:00:00'), -(9, 'c_index', 'EvaluationFunction', '0', '0', '', '1', 'Used for survival Analysis', '', '2014-12-31 21:00:00'), -(10, 'chi-squared', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(11, 'class_complexity', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, in bits, of the class distribution generated by the model\'s predictions. Calculated by taking the sum of -log2(predictedProb) over all instances, where predictedProb is the probability (according to the model) of the actual class for that instance. If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(12, 'class_complexity_gain', 'EvaluationFunction', '-Inf', 'Inf', 'bits', '1', 'Entropy reduction, in bits, between the class distribution generated by the model\'s predictions, and the prior class distribution. 
Calculated by taking the difference of the prior_class_complexity and the class_complexity.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(13, 'confusion_matrix', 'EvaluationFunction', '', '', '', '1', 'The confusion matrix, or contingency table, is a table that summarizes the number of instances that were predicted to belong to a certain class, versus their actual class. It is an NxN matrix where N is the number of different class values, with the predicted classes in the columns and the actual classes in the rows. \r\n\r\nIn the case of 2 class values (positive and negative), the fields in the matrix are respectively, from left-to-right, top-to-bottom, the number of true positives (TP), false negatives (FN), false positives (FP) and true negatives (TN).\r\n\r\nThe number of correctly classified instances is the sum of diagonals in the matrix; all others are incorrectly classified (e.g. class ”a” gets misclassified as ”b”).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Confusion_matrix\r\n\r\nThe values of the confusion matrix are each labeled with the actual and predicted class, e.g. 
\'actual=pos, predicted=neg\'.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(14, 'correlation_coefficient', 'EvaluationFunction', '-1', '1', '', '1', 'The sample Pearson correlation coefficient, or \'r\':\r\n\r\nr = \\frac{\\sum ^n _{i=1}(X_i - \\bar{X})(Y_i - \\bar{Y})}{\\sqrt{\\sum ^n _{i=1}(X_i - \\bar{X})^2} \\sqrt{\\sum ^n _{i=1}(Y_i - \\bar{Y})^2}}\r\n\r\nIt measures the correlation (linear dependence) between the actual predictions and the model\'s predictions, giving a value between +1 and ?1 inclusive.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient', 'WEKA\'s Evaluation.correlationCoefficient()\r\n\r\n /**\r\n * Returns the correlation coefficient if the class is numeric.\r\n *\r\n * @return the correlation coefficient\r\n * @throws Exception if class is not numeric\r\n */\r\n public final double correlationCoefficient() throws Exception {\r\n\r\n if (m_ClassIsNominal) {\r\n throw\r\n new Exception(\"Can\'t compute correlation coefficient: \" + \r\n \"class is nominal!\");\r\n }\r\n\r\n double correlation = 0;\r\n double varActual = \r\n m_SumSqrClass - m_SumClass * m_SumClass / \r\n (m_WithClass - m_Unclassified);\r\n double varPredicted = \r\n m_SumSqrPredicted - m_SumPredicted * m_SumPredicted / \r\n (m_WithClass - m_Unclassified);\r\n double varProd = \r\n m_SumClassPredicted - m_SumClass * m_SumPredicted / \r\n (m_WithClass - m_Unclassified);\r\n\r\n if (varActual * varPredicted <= 0) {\r\n correlation = 0.0;\r\n } else {\r\n correlation = varProd / Math.sqrt(varActual * varPredicted);\r\n }\r\n\r\n return correlation;\r\n }\r\n', '2014-12-31 21:00:00'), -(15, 'cortana_quality', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(16, 'coverage', 'EvaluationFunction', '0', 'inf', '', NULL, 'The number of observations in the current subgroup.', '', '2016-06-30 09:43:24'), -(17, 'f_measure', 'EvaluationFunction', '0', '0', '', '1', 'The 
F-Measure is the harmonic mean of precision and recall, also known as the the traditional F-measure, balanced F-score, or F1-score:\r\n\r\nFormula:\r\n2*Precision*Recall/(Precision+Recall)\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nF-measure is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_f_measure for the weighted average over all class values.', 'WEKA\'s Evaluation.fMeasure(int classIndex):\r\n\r\n /**\r\n * Calculate the F-Measure with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * 2 * recall * precision\r\n   * ----------------------\r\n   *   recall + precision\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the F-Measure\r\n */\r\n public double fMeasure(int classIndex) {\r\n\r\n double precision = precision(classIndex);\r\n double recall = recall(classIndex);\r\n if ((precision + recall) == 0) {\r\n return 0;\r\n }\r\n return 2 * precision * recall / (precision + recall);\r\n }', '2014-12-31 21:00:00'), -(18, 'information_gain', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(19, 'jaccard', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(20, 'kappa', 'EvaluationFunction', '-1', '1', '', '1', 'Cohen\'s kappa coefficient is a statistical measure of agreement for qualitative (categorical) items: it measures the agreement of prediction with the true class – 1.0 signifies complete agreement. \r\n\r\nIt is generally thought to be a more robust measure than simple percent agreement calculation since kappa takes into account the agreement occurring by chance. However, some researchers have expressed concern over kappa\'s tendency to take the observed categories\' frequencies as givens, which can have the effect of underestimating agreement for a category that is also commonly used; for this reason, kappa is considered an overly conservative measure of agreement.\r\n\r\nThe equation for kappa is:\r\n\r\n\\kappa = \\frac{\\Pr(a) - \\Pr(e)}{1 - \\Pr(e)}, \\!\r\n\r\nwhere Pr(a) is the relative observed agreement among raters, and Pr(e) is the hypothetical probability of chance agreement, using the observed data to calculate the probabilities of each observer randomly saying each category. If the raters are in complete agreement then kappa = 1. If there is no agreement among the raters other than what would be expected by chance (as defined by Pr(e)), kappa = 0.\r\n\r\nSee: Cohen, Jacob (1960). A coefficient of agreement for nominal scales. 
Educational and Psychological Measurement 20 (1): 37–46.', 'WEKA\'s Evaluation.kappa(), based on the confusion matrix.\r\n\r\npublic final double kappa() {\r\n \r\n double[] sumRows = new double[m_ConfusionMatrix.length];\r\n double[] sumColumns = new double[m_ConfusionMatrix.length];\r\n double sumOfWeights = 0;\r\n for (int i = 0; i < m_ConfusionMatrix.length; i++) {\r\n for (int j = 0; j < m_ConfusionMatrix.length; j++) {\r\n sumRows[i] += m_ConfusionMatrix[i][j];\r\n sumColumns[j] += m_ConfusionMatrix[i][j];\r\n sumOfWeights += m_ConfusionMatrix[i][j];\r\n }\r\n }\r\n double correct = 0, chanceAgreement = 0;\r\n for (int i = 0; i < m_ConfusionMatrix.length; i++) {\r\n chanceAgreement += (sumRows[i] * sumColumns[i]);\r\n correct += m_ConfusionMatrix[i][i];\r\n }\r\n chanceAgreement /= (sumOfWeights * sumOfWeights);\r\n correct /= sumOfWeights;\r\n\r\n if (chanceAgreement < 1) {\r\n return (correct - chanceAgreement) / (1 - chanceAgreement);\r\n } else {\r\n return 1;\r\n }\r\n}', '2014-12-31 21:00:00'), -(21, 'kb_relative_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'The Kononenko and Bratko Information score, divided by the prior entropy of the class distribution.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', '', '2014-12-31 21:00:00'), -(22, 'kohavi_wolpert_bias_squared', 'EvaluationFunction', '', '', '', '0', 'Bias component (squared) of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi & D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. 
of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity measures how closely\r\nthe learning algorithms average guess over all possible training sets of the given training set size matches the target.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(23, 'kohavi_wolpert_error', 'EvaluationFunction', '', '', '', '0', 'Error rate measured in the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi & D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(24, 'kohavi_wolpert_sigma_squared', 'EvaluationFunction', '', '', '', '0', 'Intrinsic error component (squared) of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi and D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity is a lower bound on the expected cost of any learning algorithm. It is the expected cost of the Bayes optimal classi fier.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. 
Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(25, 'kohavi_wolpert_variance', 'EvaluationFunction', '', '', '', '0', 'Variance component of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi and D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity measures how much the\r\nlearning algorithms guess \"bounces around\" for the different training sets of the given size.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(26, 'kononenko_bratko_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'Kononenko and Bratko Information score. This measures predictive accuracy but eliminates the influence of prior probabilities.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(27, 'matthews_correlation_coefficient', 'EvaluationFunction', '-1', '1', '', '1', 'The Matthews correlation coefficient takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient between the observed and predicted binary classifications; it returns a value between ?1 and +1. 
A coefficient of +1 represents a perfect prediction, 0 no better than random prediction and ?1 indicates total disagreement between prediction and observation. The statistic is also known as the phi coefficient. MCC is related to the chi-square statistic for a 2×2 contingency table.\r\n\r\nThe MCC can be calculated directly from the confusion matrix using the formula:\r\n\r\n\r\n\\text{MCC} = \\frac{ TP \\times TN - FP \\times FN } {\\sqrt{ (TP + FP) ( TP + FN ) ( TN + FP ) ( TN + FN ) } }\r\n\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Matthews_correlation_coefficient\r\n', '', '2014-12-31 21:00:00'), -(28, 'mean_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The mean absolute error (MAE) measures how close the model\'s predictions are to the actual target values. It is the sum of the absolute value of the difference of each instance prediction and the actual value. For classification, the 0/1-error is used.\r\n\r\n\\mathrm{MAE} = \\frac{1}{n}\\sum_{i=1}^n \\left| f_i-y_i\\right| =\\frac{1}{n}\\sum_{i=1}^n \\left| e_i \\right|.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_absolute_error', 'See WEKA\'s Evaluation class\r\n\r\n', '2014-12-31 21:00:00'), -(29, 'mean_class_complexity', 'EvaluationFunction', '0', 'Inf', '', '1', 'The entropy of the class distribution generated by the model (see class_complexity), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(30, 'mean_class_complexity_gain', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'The entropy gain of the class distribution by the model over the prior distribution (see class_complexity_gain), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(31, 'mean_f_measure', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average F-Measure. 
\r\n\r\nIn macro-averaging, F-measure is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(32, 'mean_kononenko_bratko_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'Kononenko and Bratko Information score, see kononenko_bratko_information_score, divided by the number of instances in the input data.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(33, 'mean_precision', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average Precision. \r\n\r\nIn macro-averaging, Precision is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(34, 'mean_prior_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The mean prior absolute error (MPAE) is the mean absolute error (see mean_absolute_error) of the prior (e.g., default class prediction).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_absolute_error', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(35, 'mean_prior_class_complexity', 'EvaluationFunction', '0', 'Inf', '', '1', 'The entropy of the class distribution of the prior (see prior_class_complexity), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(36, 'mean_recall', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average Recall. 
\r\n\r\nIn macro-averaging, Recall is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(37, 'mean_weighted_area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average area_under_ROC_curve (AUROC). \r\n\r\nIn macro-averaging, AUROC is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, AUROC is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(38, 'mean_weighted_f_measure', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average F-Measure. \r\n\r\nIn macro-averaging, F-measure is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, F-measure is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(39, 'mean_weighted_precision', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average Precision. \r\n\r\nIn macro-averaging, Precision is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, Precision is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(40, 'mean_weighted_recall', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average Recall. 
\r\n\r\nIn macro-averaging, Recall is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, Recall is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(41, 'number_of_instances', 'EvaluationFunction', '0', 'inf', '', NULL, 'The number of instances used for this evaluation. ', '', '2014-12-31 21:00:00'), -(42, 'os_information', 'EvaluationFunction', '', '', '', '', 'Default information about OS, JVM, installations, etc. ', '', '2014-12-31 21:00:00'), -(43, 'positives', 'EvaluationFunction', '', '', '', NULL, 'The amount of positives in the subgroup', '', '2016-06-30 09:43:24'), -(44, 'precision', 'EvaluationFunction', '0', '0', '', '1', 'Precision is defined as the number of true positive (TP) predictions, divided by the sum of the number of true positives and false positives (TP+FP):\r\n\r\n\\text{Precision}=\\frac{tp}{tp+fp} \\, \r\n\r\nIt is also referred to as the Positive predictive value (PPV).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nPrecision is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_precision for the weighted average over all class values.', 'WEKA\'s Evaluation.precision(int classIndex)\r\n\r\n /**\r\n * Calculate the precision with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * correctly classified positives\r\n   * ------------------------------\r\n   *  total predicted as positive\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the precision\r\n */\r\n public double precision(int classIndex) {\r\n\r\n double correct = 0, total = 0;\r\n for (int i = 0; i < m_NumClasses; i++) {\r\n if (i == classIndex) {\r\n correct += m_ConfusionMatrix[i][classIndex];\r\n }\r\n total += m_ConfusionMatrix[i][classIndex];\r\n }\r\n if (total == 0) {\r\n return 0;\r\n }\r\n return correct / total;\r\n}', '2014-12-31 21:00:00'), -(45, 'predictive_accuracy', 'EvaluationFunction', '0', '1', '', '1', 'The Predictive Accuracy is the percentage of instances that are classified correctly. Is it 1 - ErrorRate.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(46, 'prior_class_complexity', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, in bits, of the prior class distribution. Calculated by taking the sum of -log2(priorProb) over all instances, where priorProb is the prior probability of the actual class for that instance. If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(47, 'prior_entropy', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, in bits, of the prior class distribution. Calculated by taking the sum of -log2(priorProb) over all instances, where priorProb is the prior probability of the actual class for that instance. 
If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(48, 'probability', 'EvaluationFunction', '', '', '', NULL, 'The probability of a subgroup.', '', '2016-06-30 09:43:24'), -(49, 'quality', 'EvaluationFunction', '', '', '', 'true', 'The quality of the founded subgroup', '', '2016-06-30 09:43:24'), -(50, 'ram_hours', 'EvaluationFunction', '0', 'Inf', 'GB RAM x hours', '0', 'Every GB of RAM deployed for 1 hour equals one RAM-Hour.', '', '2014-12-31 21:00:00'), -(51, 'recall', 'EvaluationFunction', '0', '0', '', '1', 'Recall is defined as the number of true positive (TP) predictions, divided by the sum of the number of true positives and false negatives (TP+FN):\r\n\r\n\\text{Recall}=\\frac{tp}{tp+fn} \\, \r\n\r\nIt is also referred to as the True Positive Rate (TPR) or Sensitivity.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nRecall is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_recall for the weighted average over all class values.', 'WEKA\'s Evaluation.truePositiveRate(int classIndex):\r\n\r\n /**\r\n * Calculate the true positive rate with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * correctly classified positives\r\n   * ------------------------------\r\n   *       total positives\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the true positive rate\r\n */\r\n public double truePositiveRate(int classIndex) {\r\n\r\n double correct = 0, total = 0;\r\n for (int j = 0; j < m_NumClasses; j++) {\r\n if (j == classIndex) {\r\n correct += m_ConfusionMatrix[classIndex][j];\r\n }\r\n total += m_ConfusionMatrix[classIndex][j];\r\n }\r\n if (total == 0) {\r\n return 0;\r\n }\r\n return correct / total;\r\n}', '2014-12-31 21:00:00'), -(52, 'relative_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The Relative Absolute Error (RAE) is the mean absolute error (MAE) divided by the mean prior absolute error (MPAE).', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(53, 'root_mean_prior_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Mean Prior Squared Error (RMPSE) is the Root Mean Squared Error (RMSE) of the prior (e.g., the default class prediction).', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(54, 'root_mean_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Mean Squared Error (RMSE) measures how close the model\'s predictions are to the actual target values. It is the square root of the Mean Squared Error (MSE), the sum of the squared differences between the predicted value and the actual value. For classification, the 0/1-error is used.\r\n\r\n:\\operatorname{MSE}(\\overline{X})=\\operatorname{E}((\\overline{X}-\\mu)^2)=\\left(\\frac{\\sigma}{\\sqrt{n}}\\right)^2= \\frac{\\sigma^2}{n}\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_squared_error', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(55, 'root_relative_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Relative Squared Error (RRSE) is the Root Mean Squared Error (RMSE) divided by the Root Mean Prior Squared Error (RMPSE). 
See root_mean_squared_error and root_mean_prior_squared_error.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(56, 'run_cpu_time', 'EvaluationFunction', '0', 'Inf', 'seconds', '0', 'Runtime in seconds of the entire run. In the case of cross-validation runs, this will include all iterations.', '', '2014-12-31 21:00:00'), -(57, 'run_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'Amount of memory, in bytes, used during the entire run.', '', '2014-12-31 21:00:00'), -(58, 'run_virtual_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'Amount of virtual memory, in bytes, used during the entire run.', '', '2014-12-31 21:00:00'), -(59, 'scimark_benchmark', 'EvaluationFunction', '0', 'Inf', 'MFlops', '1', 'A benchmark tool which measures (single core) CPU performance on the JVM. ', 'See http://math.nist.gov/scimark2/', '2014-12-31 21:00:00'), -(60, 'single_point_area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', NULL, '', '2014-12-31 21:00:00'), -(61, 'total_cost', 'EvaluationFunction', '-Inf', 'Inf', '', '0', NULL, '', '2014-12-31 21:00:00'), -(62, 'unclassified_instance_count', 'EvaluationFunction', '0', 'Inf', 'instances', '1', 'Number of instances that were not classified by the model.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 21:00:00'), -(63, 'usercpu_time_millis', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to build and test a single model on all data.', '', '2014-12-31 21:00:00'), -(64, 'usercpu_time_millis_testing', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to test a single model on all data.', '', '2014-12-31 21:00:00'), -(65, 'usercpu_time_millis_training', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to build a single model on all data.', '', '2014-12-31 21:00:00'), -(66, 'webb_bias', 'EvaluationFunction', '', '', '', '0', 'Bias component (squared) of the bias-variance decomposition as defined by 
Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity measures how closely\r\nthe learning algorithms average guess over all possible training sets of the given training set size matches the target.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(67, 'webb_error', 'EvaluationFunction', '', '', '', '0', 'Intrinsic error component (squared) of the bias-variance decomposition as defined by Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity is a lower bound on the expected cost of any learning algorithm. It is the expected cost of the Bayes optimal classi fier.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(68, 'webb_variance', 'EvaluationFunction', '', '', '', '0', 'Variance component of the bias-variance decomposition as defined by Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity measures how much the\r\nlearning algorithms guess \"bounces around\" for the different training sets of the given size.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. 
Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 21:00:00'), -(69, 'joint_entropy', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 09:43:24'), -(70, 'pattern_team_auroc10', 'EvaluationFunction', '', '', '', NULL, 'Area under the ROC curve for the 10 best subgroups', '', '2016-06-30 09:43:24'), -(71, 'wall_clock_time_millis', 'EvaluationFunction', '0', 'inf', 'millisecond', 'False', 'The number of milliseconds from the start of training until the completion of testing. Thus, involves both training and testing. Does not take into account the number of cores. ', '', '2018-08-15 16:26:51'), -(72, 'wall_clock_time_millis_training', 'EvaluationFunction', '0', 'Inf', 'millisecond', 'False', 'The number of milliseconds from the start of training until the completion of training. Does not take into account the number of cores. ', '', '2018-08-15 16:26:51'), -(73, 'wall_clock_time_millis_testing', 'EvaluationFunction', '0', 'Inf', 'millisecond', 'False', 'The number of milliseconds from the start of testing until the completion of testing. Does not take into account the number of cores. 
', '', '2018-08-15 16:26:51'); - +(1, 'EuclideanDistance', 'Metric', '0', '', '', NULL, NULL, '', '2014-12-31 20:00:00'), +(2, 'PolynomialKernel', 'KernelFunction', '', '', '', NULL, NULL, '', '2014-12-31 20:00:00'), +(3, 'RBFKernel', 'KernelFunction', '', '', '', NULL, NULL, '', '2014-12-31 20:00:00'), +(4, 'area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', 'The area under the ROC curve (AUROC), calculated using the Mann-Whitney U-test.\r\n\r\nThe curve is constructed by shifting the threshold for a positive prediction from 0 to 1, yielding a series of true positive rates (TPR) and false positive rates (FPR), from which a step-wise ROC curve can be constructed.\r\n\r\nSee http://en.wikipedia.org/wiki/Receiver_operating_characteristic\r\n\r\nNote that this is different from the Area Under the ROC Convex Hull (ROC AUCH).\r\n\r\nAUROC is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_area_under_roc_curve for the weighted average over all class values.', 'See WEKA\'s ThresholdCurve class.', '2014-12-31 20:00:00'), +(5, 'average_cost', 'EvaluationFunction', '-Inf', 'Inf', '', '0', NULL, '', '2014-12-31 20:00:00'), +(6, 'binominal_test', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(7, 'build_cpu_time', 'EvaluationFunction', '0', 'Inf', 'seconds', '0', 'The time in seconds to build a single model on all data.', '', '2014-12-31 20:00:00'), +(8, 'build_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'The memory, in bytes, needed to build a single model on all data.', '', '2014-12-31 20:00:00'), +(9, 'c_index', 'EvaluationFunction', '0', '0', '', '1', 'Used for survival Analysis', '', '2014-12-31 20:00:00'), +(10, 'chi-squared', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(11, 'class_complexity', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, 
in bits, of the class distribution generated by the model\'s predictions. Calculated by taking the sum of -log2(predictedProb) over all instances, where predictedProb is the probability (according to the model) of the actual class for that instance. If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(12, 'class_complexity_gain', 'EvaluationFunction', '-Inf', 'Inf', 'bits', '1', 'Entropy reduction, in bits, between the class distribution generated by the model\'s predictions, and the prior class distribution. Calculated by taking the difference of the prior_class_complexity and the class_complexity.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(13, 'confusion_matrix', 'EvaluationFunction', '', '', '', '1', 'The confusion matrix, or contingency table, is a table that summarizes the number of instances that were predicted to belong to a certain class, versus their actual class. It is an NxN matrix where N is the number of different class values, with the predicted classes in the columns and the actual classes in the rows. \r\n\r\nIn the case of 2 class values (positive and negative), the fields in the matrix are respectively, from left-to-right, top-to-bottom, the number of true positives (TP), false negatives (FN), false positives (FP) and true negatives (TN).\r\n\r\nThe number of correctly classified instances is the sum of diagonals in the matrix; all others are incorrectly classified (e.g. class ”a” gets misclassified as ”b”).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Confusion_matrix\r\n\r\nThe values of the confusion matrix are each labeled with the actual and predicted class, e.g. 
\'actual=pos, predicted=neg\'.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(14, 'correlation_coefficient', 'EvaluationFunction', '-1', '1', '', '1', 'The sample Pearson correlation coefficient, or \'r\':\r\n\r\nr = \\frac{\\sum ^n _{i=1}(X_i - \\bar{X})(Y_i - \\bar{Y})}{\\sqrt{\\sum ^n _{i=1}(X_i - \\bar{X})^2} \\sqrt{\\sum ^n _{i=1}(Y_i - \\bar{Y})^2}}\r\n\r\nIt measures the correlation (linear dependence) between the actual predictions and the model\'s predictions, giving a value between +1 and ?1 inclusive.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient', 'WEKA\'s Evaluation.correlationCoefficient()\r\n\r\n /**\r\n * Returns the correlation coefficient if the class is numeric.\r\n *\r\n * @return the correlation coefficient\r\n * @throws Exception if class is not numeric\r\n */\r\n public final double correlationCoefficient() throws Exception {\r\n\r\n if (m_ClassIsNominal) {\r\n throw\r\n new Exception(\"Can\'t compute correlation coefficient: \" + \r\n \"class is nominal!\");\r\n }\r\n\r\n double correlation = 0;\r\n double varActual = \r\n m_SumSqrClass - m_SumClass * m_SumClass / \r\n (m_WithClass - m_Unclassified);\r\n double varPredicted = \r\n m_SumSqrPredicted - m_SumPredicted * m_SumPredicted / \r\n (m_WithClass - m_Unclassified);\r\n double varProd = \r\n m_SumClassPredicted - m_SumClass * m_SumPredicted / \r\n (m_WithClass - m_Unclassified);\r\n\r\n if (varActual * varPredicted <= 0) {\r\n correlation = 0.0;\r\n } else {\r\n correlation = varProd / Math.sqrt(varActual * varPredicted);\r\n }\r\n\r\n return correlation;\r\n }\r\n', '2014-12-31 20:00:00'), +(15, 'cortana_quality', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(16, 'coverage', 'EvaluationFunction', '0', 'inf', '', NULL, 'The number of observations in the current subgroup.', '', '2016-06-30 07:43:24'), +(17, 'f_measure', 'EvaluationFunction', '0', '0', '', '1', 'The 
F-Measure is the harmonic mean of precision and recall, also known as the the traditional F-measure, balanced F-score, or F1-score:\r\n\r\nFormula:\r\n2*Precision*Recall/(Precision+Recall)\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nF-measure is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_f_measure for the weighted average over all class values.', 'WEKA\'s Evaluation.fMeasure(int classIndex):\r\n\r\n /**\r\n * Calculate the F-Measure with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * 2 * recall * precision\r\n   * ----------------------\r\n   *   recall + precision\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the F-Measure\r\n */\r\n public double fMeasure(int classIndex) {\r\n\r\n double precision = precision(classIndex);\r\n double recall = recall(classIndex);\r\n if ((precision + recall) == 0) {\r\n return 0;\r\n }\r\n return 2 * precision * recall / (precision + recall);\r\n }', '2014-12-31 20:00:00'), +(18, 'information_gain', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(19, 'jaccard', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(20, 'kappa', 'EvaluationFunction', '-1', '1', '', '1', 'Cohen\'s kappa coefficient is a statistical measure of agreement for qualitative (categorical) items: it measures the agreement of prediction with the true class – 1.0 signifies complete agreement. \r\n\r\nIt is generally thought to be a more robust measure than simple percent agreement calculation since kappa takes into account the agreement occurring by chance. However, some researchers have expressed concern over kappa\'s tendency to take the observed categories\' frequencies as givens, which can have the effect of underestimating agreement for a category that is also commonly used; for this reason, kappa is considered an overly conservative measure of agreement.\r\n\r\nThe equation for kappa is:\r\n\r\n\\kappa = \\frac{\\Pr(a) - \\Pr(e)}{1 - \\Pr(e)}, \\!\r\n\r\nwhere Pr(a) is the relative observed agreement among raters, and Pr(e) is the hypothetical probability of chance agreement, using the observed data to calculate the probabilities of each observer randomly saying each category. If the raters are in complete agreement then kappa = 1. If there is no agreement among the raters other than what would be expected by chance (as defined by Pr(e)), kappa = 0.\r\n\r\nSee: Cohen, Jacob (1960). A coefficient of agreement for nominal scales. 
Educational and Psychological Measurement 20 (1): 37–46.', 'WEKA\'s Evaluation.kappa(), based on the confusion matrix.\r\n\r\npublic final double kappa() {\r\n \r\n double[] sumRows = new double[m_ConfusionMatrix.length];\r\n double[] sumColumns = new double[m_ConfusionMatrix.length];\r\n double sumOfWeights = 0;\r\n for (int i = 0; i < m_ConfusionMatrix.length; i++) {\r\n for (int j = 0; j < m_ConfusionMatrix.length; j++) {\r\n sumRows[i] += m_ConfusionMatrix[i][j];\r\n sumColumns[j] += m_ConfusionMatrix[i][j];\r\n sumOfWeights += m_ConfusionMatrix[i][j];\r\n }\r\n }\r\n double correct = 0, chanceAgreement = 0;\r\n for (int i = 0; i < m_ConfusionMatrix.length; i++) {\r\n chanceAgreement += (sumRows[i] * sumColumns[i]);\r\n correct += m_ConfusionMatrix[i][i];\r\n }\r\n chanceAgreement /= (sumOfWeights * sumOfWeights);\r\n correct /= sumOfWeights;\r\n\r\n if (chanceAgreement < 1) {\r\n return (correct - chanceAgreement) / (1 - chanceAgreement);\r\n } else {\r\n return 1;\r\n }\r\n}', '2014-12-31 20:00:00'), +(21, 'kb_relative_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'The Kononenko and Bratko Information score, divided by the prior entropy of the class distribution.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', '', '2014-12-31 20:00:00'), +(22, 'kohavi_wolpert_bias_squared', 'EvaluationFunction', '', '', '', '0', 'Bias component (squared) of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi & D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. 
of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity measures how closely\r\nthe learning algorithms average guess over all possible training sets of the given training set size matches the target.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(23, 'kohavi_wolpert_error', 'EvaluationFunction', '', '', '', '0', 'Error rate measured in the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi & D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(24, 'kohavi_wolpert_sigma_squared', 'EvaluationFunction', '', '', '', '0', 'Intrinsic error component (squared) of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi and D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity is a lower bound on the expected cost of any learning algorithm. It is the expected cost of the Bayes optimal classi fier.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. 
Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(25, 'kohavi_wolpert_variance', 'EvaluationFunction', '', '', '', '0', 'Variance component of the bias-variance decomposition as defined by Kohavi and Wolpert in:\r\n\r\nR. Kohavi and D. Wolpert (1996), Bias plus variance decomposition for zero-one loss functions, in Proc. of the Thirteenth International Machine Learning Conference (ICML96)\r\n\r\nThis quantity measures how much the\r\nlearning algorithms guess \"bounces around\" for the different training sets of the given size.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(26, 'kononenko_bratko_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'Kononenko and Bratko Information score. This measures predictive accuracy but eliminates the influence of prior probabilities.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(27, 'matthews_correlation_coefficient', 'EvaluationFunction', '-1', '1', '', '1', 'The Matthews correlation coefficient takes into account true and false positives and negatives and is generally regarded as a balanced measure which can be used even if the classes are of very different sizes. The MCC is in essence a correlation coefficient between the observed and predicted binary classifications; it returns a value between ?1 and +1. 
A coefficient of +1 represents a perfect prediction, 0 no better than random prediction and ?1 indicates total disagreement between prediction and observation. The statistic is also known as the phi coefficient. MCC is related to the chi-square statistic for a 2×2 contingency table.\r\n\r\nThe MCC can be calculated directly from the confusion matrix using the formula:\r\n\r\n\r\n\\text{MCC} = \\frac{ TP \\times TN - FP \\times FN } {\\sqrt{ (TP + FP) ( TP + FN ) ( TN + FP ) ( TN + FN ) } }\r\n\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Matthews_correlation_coefficient\r\n', '', '2014-12-31 20:00:00'), +(28, 'mean_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The mean absolute error (MAE) measures how close the model\'s predictions are to the actual target values. It is the sum of the absolute value of the difference of each instance prediction and the actual value. For classification, the 0/1-error is used.\r\n\r\n\\mathrm{MAE} = \\frac{1}{n}\\sum_{i=1}^n \\left| f_i-y_i\\right| =\\frac{1}{n}\\sum_{i=1}^n \\left| e_i \\right|.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_absolute_error', 'See WEKA\'s Evaluation class\r\n\r\n', '2014-12-31 20:00:00'), +(29, 'mean_class_complexity', 'EvaluationFunction', '0', 'Inf', '', '1', 'The entropy of the class distribution generated by the model (see class_complexity), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(30, 'mean_class_complexity_gain', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'The entropy gain of the class distribution by the model over the prior distribution (see class_complexity_gain), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(31, 'mean_f_measure', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average F-Measure. 
\r\n\r\nIn macro-averaging, F-measure is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(32, 'mean_kononenko_bratko_information_score', 'EvaluationFunction', '-Inf', 'Inf', '', '1', 'Kononenko and Bratko Information score, see kononenko_bratko_information_score, divided by the number of instances in the input data.\r\n\r\nSee:\r\nKononenko, I., Bratko, I.: Information-based evaluation criterion for classi er\'s performance. Machine\r\nLearning 6 (1991) 67-80', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(33, 'mean_precision', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average Precision. \r\n\r\nIn macro-averaging, Precision is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(34, 'mean_prior_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The mean prior absolute error (MPAE) is the mean absolute error (see mean_absolute_error) of the prior (e.g., default class prediction).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_absolute_error', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(35, 'mean_prior_class_complexity', 'EvaluationFunction', '0', 'Inf', '', '1', 'The entropy of the class distribution of the prior (see prior_class_complexity), divided by the number of instances in the input data.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(36, 'mean_recall', 'EvaluationFunction', '0', '1', '', '1', 'Unweighted(!) macro-average Recall. 
\r\n\r\nIn macro-averaging, Recall is computed\r\nlocally over each category ?rst and then the average over all categories is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(37, 'mean_weighted_area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average area_under_ROC_curve (AUROC). \r\n\r\nIn macro-averaging, AUROC is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, AUROC is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(38, 'mean_weighted_f_measure', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average F-Measure. \r\n\r\nIn macro-averaging, F-measure is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, F-measure is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(39, 'mean_weighted_precision', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average Precision. \r\n\r\nIn macro-averaging, Precision is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, Precision is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(40, 'weighted_recall', 'EvaluationFunction', '0', '1', '', '1', 'The macro weighted (by class size) average Recall. 
\r\n\r\nIn macro-averaging, Recall is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, Recall is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(41, 'number_of_instances', 'EvaluationFunction', '0', 'inf', '', NULL, 'The number of instances used for this evaluation. ', '', '2014-12-31 20:00:00'), +(42, 'os_information', 'EvaluationFunction', '', '', '', '', 'Default information about OS, JVM, installations, etc. ', '', '2014-12-31 20:00:00'), +(43, 'positives', 'EvaluationFunction', '', '', '', NULL, 'The amount of positives in the subgroup', '', '2016-06-30 07:43:24'), +(44, 'precision', 'EvaluationFunction', '0', '0', '', '1', 'Precision is defined as the number of true positive (TP) predictions, divided by the sum of the number of true positives and false positives (TP+FP):\r\n\r\n\\text{Precision}=\\frac{tp}{tp+fp} \\, \r\n\r\nIt is also referred to as the Positive predictive value (PPV).\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nPrecision is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_precision for the weighted average over all class values.', 'WEKA\'s Evaluation.precision(int classIndex)\r\n\r\n /**\r\n * Calculate the precision with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * correctly classified positives\r\n   * ------------------------------\r\n   *  total predicted as positive\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the precision\r\n */\r\n public double precision(int classIndex) {\r\n\r\n double correct = 0, total = 0;\r\n for (int i = 0; i < m_NumClasses; i++) {\r\n if (i == classIndex) {\r\n correct += m_ConfusionMatrix[i][classIndex];\r\n }\r\n total += m_ConfusionMatrix[i][classIndex];\r\n }\r\n if (total == 0) {\r\n return 0;\r\n }\r\n return correct / total;\r\n}', '2014-12-31 20:00:00'), +(45, 'predictive_accuracy', 'EvaluationFunction', '0', '1', '', '1', 'The Predictive Accuracy is the percentage of instances that are classified correctly. Is it 1 - ErrorRate.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(46, 'prior_class_complexity', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, in bits, of the prior class distribution. Calculated by taking the sum of -log2(priorProb) over all instances, where priorProb is the prior probability of the actual class for that instance. If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(47, 'prior_entropy', 'EvaluationFunction', '0', 'Inf', 'bits', '1', 'Entropy, in bits, of the prior class distribution. Calculated by taking the sum of -log2(priorProb) over all instances, where priorProb is the prior probability of the actual class for that instance. 
If instances are weighted, the weighted sum is taken.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(48, 'probability', 'EvaluationFunction', '', '', '', NULL, 'The probability of a subgroup.', '', '2016-06-30 07:43:24'), +(49, 'quality', 'EvaluationFunction', '', '', '', 'true', 'The quality of the founded subgroup', '', '2016-06-30 07:43:24'), +(50, 'ram_hours', 'EvaluationFunction', '0', 'Inf', 'GB RAM x hours', '0', 'Every GB of RAM deployed for 1 hour equals one RAM-Hour.', '', '2014-12-31 20:00:00'), +(51, 'recall', 'EvaluationFunction', '0', '0', '', '1', 'Recall is defined as the number of true positive (TP) predictions, divided by the sum of the number of true positives and false negatives (TP+FN):\r\n\r\n\\text{Recall}=\\frac{tp}{tp+fn} \\, \r\n\r\nIt is also referred to as the True Positive Rate (TPR) or Sensitivity.\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Precision_and_recall\r\n\r\nRecall is defined only for a specific class value, and should thus be labeled with the class value for which is was computed. Use the mean_weighted_recall for the weighted average over all class values.', 'WEKA\'s Evaluation.truePositiveRate(int classIndex):\r\n\r\n /**\r\n * Calculate the true positive rate with respect to a particular class. \r\n * This is defined as

\r\n *

\r\n   * correctly classified positives\r\n   * ------------------------------\r\n   *       total positives\r\n   * 
\r\n *\r\n * @param classIndex the index of the class to consider as \"positive\"\r\n * @return the true positive rate\r\n */\r\n public double truePositiveRate(int classIndex) {\r\n\r\n double correct = 0, total = 0;\r\n for (int j = 0; j < m_NumClasses; j++) {\r\n if (j == classIndex) {\r\n correct += m_ConfusionMatrix[classIndex][j];\r\n }\r\n total += m_ConfusionMatrix[classIndex][j];\r\n }\r\n if (total == 0) {\r\n return 0;\r\n }\r\n return correct / total;\r\n}', '2014-12-31 20:00:00'), +(52, 'relative_absolute_error', 'EvaluationFunction', '0', '1', '', '0', 'The Relative Absolute Error (RAE) is the mean absolute error (MAE) divided by the mean prior absolute error (MPAE).', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(53, 'root_mean_prior_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Mean Prior Squared Error (RMPSE) is the Root Mean Squared Error (RMSE) of the prior (e.g., the default class prediction).', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(54, 'root_mean_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Mean Squared Error (RMSE) measures how close the model\'s predictions are to the actual target values. It is the square root of the Mean Squared Error (MSE), the sum of the squared differences between the predicted value and the actual value. For classification, the 0/1-error is used.\r\n\r\n:\\operatorname{MSE}(\\overline{X})=\\operatorname{E}((\\overline{X}-\\mu)^2)=\\left(\\frac{\\sigma}{\\sqrt{n}}\\right)^2= \\frac{\\sigma^2}{n}\r\n\r\nSee:\r\nhttp://en.wikipedia.org/wiki/Mean_squared_error', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(55, 'root_relative_squared_error', 'EvaluationFunction', '0', '1', '', '0', 'The Root Relative Squared Error (RRSE) is the Root Mean Squared Error (RMSE) divided by the Root Mean Prior Squared Error (RMPSE). 
See root_mean_squared_error and root_mean_prior_squared_error.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(56, 'run_cpu_time', 'EvaluationFunction', '0', 'Inf', 'seconds', '0', 'Runtime in seconds of the entire run. In the case of cross-validation runs, this will include all iterations.', '', '2014-12-31 20:00:00'), +(57, 'run_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'Amount of memory, in bytes, used during the entire run.', '', '2014-12-31 20:00:00'), +(58, 'run_virtual_memory', 'EvaluationFunction', '0', 'Inf', 'bytes', '0', 'Amount of virtual memory, in bytes, used during the entire run.', '', '2014-12-31 20:00:00'), +(59, 'scimark_benchmark', 'EvaluationFunction', '0', 'Inf', 'MFlops', '1', 'A benchmark tool which measures (single core) CPU performance on the JVM. ', 'See http://math.nist.gov/scimark2/', '2014-12-31 20:00:00'), +(60, 'single_point_area_under_roc_curve', 'EvaluationFunction', '0', '1', '', '1', NULL, '', '2014-12-31 20:00:00'), +(61, 'total_cost', 'EvaluationFunction', '-Inf', 'Inf', '', '0', NULL, '', '2014-12-31 20:00:00'), +(62, 'unclassified_instance_count', 'EvaluationFunction', '0', 'Inf', 'instances', '1', 'Number of instances that were not classified by the model.', 'See WEKA\'s Evaluation class\r\n', '2014-12-31 20:00:00'), +(63, 'usercpu_time_millis', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to build and test a single model on all data.', '', '2014-12-31 20:00:00'), +(64, 'usercpu_time_millis_testing', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to test a single model on all data.', '', '2014-12-31 20:00:00'), +(65, 'usercpu_time_millis_training', 'EvaluationFunction', '0', 'Inf', 'milliseconds', '0', 'The time in milliseconds to build a single model on all data.', '', '2014-12-31 20:00:00'), +(66, 'webb_bias', 'EvaluationFunction', '', '', '', '0', 'Bias component (squared) of the bias-variance decomposition as defined by 
Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity measures how closely\r\nthe learning algorithms average guess over all possible training sets of the given training set size matches the target.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(67, 'webb_error', 'EvaluationFunction', '', '', '', '0', 'Intrinsic error component (squared) of the bias-variance decomposition as defined by Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity is a lower bound on the expected cost of any learning algorithm. It is the expected cost of the Bayes optimal classi fier.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(68, 'webb_variance', 'EvaluationFunction', '', '', '', '0', 'Variance component of the bias-variance decomposition as defined by Webb in:\r\n\r\nGeoffrey I. Webb (2000), MultiBoosting: A Technique for Combining Boosting and Wagging, Machine Learning, 40(2), pages 159-196.\r\n\r\nThis quantity measures how much the\r\nlearning algorithms guess \"bounces around\" for the different training sets of the given size.\r\n\r\nEstimated using the classifier using the sub-sampled cross-validation procedure as specified in:\r\n\r\nGeoffrey I. 
Webb & Paul Conilione (2002), Estimating bias and variance from data , School of Computer Science and Software Engineering, Monash University, Australia', 'See WEKA\'s BVDecompose class', '2014-12-31 20:00:00'), +(69, 'joint_entropy', 'EvaluationFunction', '', '', '', NULL, 'Subgroup discovery measure.', '', '2016-06-30 07:43:24'), +(70, 'pattern_team_auroc10', 'EvaluationFunction', '', '', '', NULL, 'Area under the ROC curve for the 10 best subgroups', '', '2016-06-30 07:43:24'), +(71, 'wall_clock_time_millis', 'EvaluationFunction', '0', 'inf', 'millisecond', 'False', 'The number of milliseconds from the start of training until the completion of testing. Thus, involves both training and testing. Does not take into account the number of cores. ', '', '2018-08-15 14:26:51'), +(72, 'wall_clock_time_millis_training', 'EvaluationFunction', '0', 'Inf', 'millisecond', 'False', 'The number of milliseconds from the start of training until the completion of training. Does not take into account the number of cores. ', '', '2018-08-15 14:26:51'), +(73, 'wall_clock_time_millis_testing', 'EvaluationFunction', '0', 'Inf', 'millisecond', 'False', 'The number of milliseconds from the start of testing until the completion of testing. Does not take into account the number of cores. ', '', '2018-08-15 14:26:51'), +(74, 'unweighted_recall', 'EvaluationFunction', '0', '1', '', '1', 'The macro unweighted (ignoring class size) average Recall. 
\r\n\r\nIn macro-averaging, Recall is computed\r\nlocally over each category ?rst and then the average over all categories is taken, weighted by the number of instances of that class.\r\n\r\nConversely, in micro-averaging, Recall is computed globally over all category decisions.', 'See WEKA\'s Evaluation class\r\n', '2019-10-28 15:35:10'); \ No newline at end of file diff --git a/data/sql/task_type.sql b/data/sql/task_type.sql index c1d5c8f55..55055b462 100644 --- a/data/sql/task_type.sql +++ b/data/sql/task_type.sql @@ -3,7 +3,8 @@ INSERT INTO `task_type` (`ttid`, `name`, `description`, `creator`, `contributors (2, 'Supervised Regression', 'Given a dataset with a numeric target and a set of train/test splits, e.g. generated by a cross-validation procedure, train a model and return the predictions of that model.', 'Joaquin Vanschoren, Jan van Rijn, Luis Torgo, Bernd Bischl', 'Bo Gao, Simon Fischer, Venkatesh Umaashankar, Michael Berthold, Bernd Wiswedel ,Patrick Winter', '2013-02-13 00:00:00'), (3, 'Learning Curve', 'Given a dataset with a nominal target, various data samples of increasing size are defined. A model is build for each individual data sample; from this a learning curve can be drawn. ', 'Pavel Brazdil, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-01-21 00:00:00'), (4, 'Supervised Data Stream Classification', 'Given a dataset with a nominal target, various data samples of increasing size are defined. 
A model is build for each individual data sample; from this a learning curve can be drawn.', 'Geoffrey Holmes, Bernhard Pfahringer, Jan van Rijn, Joaquin Vanschoren', NULL, '2014-03-01 00:00:00'), -(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '"Mehdi Jamali", "Jan van Rijn", "Nenad Tomasev", "Joaquin Vanschoren"', NULL, '2014-10-24 00:00:00'), -(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '"Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-11-28 00:00:00'), -(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '"Benrd Bischl","Dominik Kirchhoff","Michel Lang","Jan van Rijn","Joaquin Vanschoren"', NULL, '2014-12-03 00:00:00'), -(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '"Jan N. 
van Rijn", "Arno Knobbe", "Joaquin Vanschoren"', NULL, '2016-06-17 10:59:20'); +(5, 'Clustering', 'Given an input dataset, the task is to partition it into various clusters.', '\"Mehdi Jamali\", \"Jan van Rijn\", \"Nenad Tomasev\", \"Joaquin Vanschoren\"', NULL, '2014-10-24 00:00:00'), +(6, 'Machine Learning Challenge', 'This is a standard machine learning challenge with a hidden private dataset.\r\nIt offers a labeled training set and an unlabeled test set. \r\n\r\nThe task is to label the unlabeled instances. Only the OpenML server knows the correct labels, and will evaluate the submitted predictions using these hidden labels. The evaluation procedure, measure, and cost function (if any) are provided.', '\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-11-28 00:00:00'), +(7, 'Survival Analysis', 'Related to Regression. Given a dataset (typically consisting of patient data) predict a left timestamp (date entering the study), right timestamp (date of leaving the study), or both. ', '\"Bernd Bischl\",\"Dominik Kirchhoff\",\"Michel Lang\",\"Jan van Rijn\",\"Joaquin Vanschoren\"', NULL, '2014-12-03 00:00:00'), +(8, 'Subgroup Discovery', 'Subgroup discovery is a data mining technique which extracts interesting rules with respect to a target variable. An important characteristic of this task is the combination of predictive and descriptive induction. An overview related to the task of subgroup discovery is presented. (description by: Herrera et. al., An overview on subgroup discovery: foundations and applications)', '\"Jan N. van Rijn\", \"Arno Knobbe\", \"Joaquin Vanschoren\"', NULL, '2016-06-17 10:59:20'), +(9, 'Multitask Regression', '', 'Jan N. 
van Rijn', NULL, '2019-10-24 23:46:54'); diff --git a/data/sql/task_type_inout.sql b/data/sql/task_type_inout.sql index 79b482e68..891169a2c 100644 --- a/data/sql/task_type_inout.sql +++ b/data/sql/task_type_inout.sql @@ -58,4 +58,7 @@ INSERT INTO `task_type_inout` (`ttid`, `name`, `type`, `io`, `requirement`, `des (8, 'source_data', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"did\",\r\n\"from\": \"dataset\"\r\n}', '\r\n[INPUT:source_data]\r\n[INPUT:target_feature]\r\n[INPUT:target_value]\r\n', '{\r\n \"name\": \"Dataset(s)\",\r\n \"autocomplete\": \"commaSeparated\",\r\n \"datasource\": \"expdbDatasetVersion()\",\r\n \"placeholder\": \"(*) include all datasets\"\r\n}'), (8, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\",\r\n\"select\": \"name\",\r\n\"from\": \"data_feature\",\r\n\"where\": \"did = \\\"[INPUT:source_data]\\\" AND data_type = \\\"nominal\\\"\"\r\n}', NULL, '{\r\n \"placeholder\": \"Use default target\"\r\n}'), (8, 'target_value', 'String', 'input', 'required', 'The value of the target feature to be used as the SD target value.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n \"placeholder\": \"Use default target value\"\r\n}'), -(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '[INPUT:time_limit]', 'NULL'); +(8, 'time_limit', 'Integer', 'input', 'required', 'The time limit for SD search', 30, '{\r\n\"data_type\": \"numeric\"\r\n}', '[INPUT:time_limit]', 'NULL'), +(9, 'estimation_procedure', 'Estimation Procedure', 'input', 'required', 'The estimation procedure used to validate the generated models', 20, '{\r\n\"data_type\": \"numeric\",\r\n\"select\": \"id\",\r\n\"from\": \"estimation_procedure\",\r\n\"where\": \"ttid = [TASK:ttid]\"\r\n}', 
'\r\n[INPUT:estimation_procedure]\r\n[LOOKUP:estimation_procedure.type]\r\n[CONSTANT:base_url]/api_splits/get/[TASK:id]/Task_[TASK:id]_splits.arff\r\n[LOOKUP:estimation_procedure.repeats]\r\n[LOOKUP:estimation_procedure.folds]\r\n[INPUT:number_samples]\r\n', '{\r\n \"type\": \"select\",\r\n \"table\": \"estimation_procedure\",\r\n \"key\": \"id\",\r\n \"value\": \"name\"\r\n}'), +(9, 'source_data_list', 'Dataset', 'input', 'required', 'The input data for this task', 10, '{\r\n\"data_type\": \"json\"\r\n}', '\r\n[INPUT:source_data_list]\r\n[INPUT:target_feature]\r\n', '{\r\n \"name\": \"Dataset(s)\",\r\n \"autocomplete\": \"commaSeparated\",\r\n \"datasource\": \"expdbDatasetVersion()\",\r\n \"placeholder\": \"(*) include all datasets\"\r\n}'), +(9, 'target_feature', 'String', 'input', 'required', 'The name of the dataset feature to be used as the target feature.', 15, '{\r\n\"data_type\": \"string\"\r\n}', NULL, '{\r\n \"default\": \"class\",\r\n \"placeholder\": \"Use default target\"\r\n}'); diff --git a/openml_OS/controllers/Api_splits.php b/openml_OS/controllers/Api_splits.php index cffd075ec..033fa4cb9 100644 --- a/openml_OS/controllers/Api_splits.php +++ b/openml_OS/controllers/Api_splits.php @@ -17,7 +17,7 @@ function __construct() { $this->load->helper('file_upload'); $this->db = $this->load->database('read',true); - $this->task_types = array(1, 2, 3, 6, 7); + $this->task_types = array(1, 2, 3, 6, 7, 9); $this->challenge_types = array(9); $this->evaluation = APPPATH . 'third_party/OpenML/Java/evaluate.jar'; $this->eval_engine_config = " -config 'cache_allowed=false;server=".BASE_URL.";api_key=".API_KEY."' "; @@ -105,13 +105,27 @@ function challenge($task_id, $testtrain, $offset_arg, $size_arg) { } } + function merge_datasets($task_id) { + $dir_idx = floor($task_id / $this->content_folder_modulo) * $this->content_folder_modulo; + $directory = $this->directory . '/' . $dir_idx . '/' . $task_id; + + $filepath = $directory . 
'/merged_dataset.arff'; + if (file_exists($filepath) == false) { + $this->generate("merge_datasets", $task_id, $filepath); + } + + header('Content-type: text/plain'); + header('Content-Length: ' . filesize($filepath)); + readfile_chunked($filepath); + } + function get($task_id) { $dir_idx = floor($task_id / $this->content_folder_modulo) * $this->content_folder_modulo; $directory = $this->directory . '/' . $dir_idx . '/' . $task_id; $filepath = $directory . '/splits.arff'; if (file_exists($filepath) == false) { - $this->generate($task_id, $filepath); + $this->generate("generate_folds", $task_id, $filepath); } header('Content-type: text/plain'); @@ -119,7 +133,7 @@ function get($task_id) { readfile_chunked($filepath); } - private function generate($task_id, $filepath) { + private function generate($function, $task_id, $filepath) { $task = $this->Task->getById($task_id); if ($task === false || in_array($task->ttid, $this->task_types) === false) { http_response_code($this->config->item('general_http_error_code')); @@ -130,7 +144,7 @@ private function generate($task_id, $filepath) { // TODO: very important. sanity check input $testset_str = array_key_exists('custom_testset', $values) && is_cs_natural_numbers($values['custom_testset']) ? '-test "' . $values['custom_testset'] . '"' : ''; - $command = 'java -jar ' . $this->evaluation . ' -f "generate_folds" -id ' . $task_id . ' ' . $this->eval_engine_config; + $command = 'java -jar ' . $this->evaluation . ' -f "' . $function . '" -id ' . $task_id . ' ' . $this->eval_engine_config; if (array_key_exists('custom_testset', $values)) { $command .= '-test "' . $values['custom_testset'] . '" '; @@ -142,9 +156,6 @@ private function generate($task_id, $filepath) { $command .= ' -o ' . $filepath; - //if( $md5 ) $command .= ' -m'; - $this->Log->cmd('API Splits::get(' . $task_id . 
')', $command); - if (function_enabled('exec')) { header('Content-type: text/plain'); $result_status = 0; @@ -155,20 +166,20 @@ private function generate($task_id, $filepath) { if ($return_status != 0 && defined('EMAIL_API_LOG')) { $to = EMAIL_API_LOG; - $subject = 'OpenML API Split Generation Exception: ' . $result_status; + $subject = 'OpenML API [' . $function . '] Exception: ' . $result_status; $content = 'Time: ' . now() . "\nTask_id:" . $task_id . "\nOutput: " . implode("\n", $result); sendEmail($to, $subject, $content, 'text'); http_response_code($this->config->item('general_http_error_code')); - die('failed to generate arff file. Evaluation Engine result send to EMAIL_API_LOG account.'); + die('failed to perform action ' . $function . '. Evaluation Engine result send to EMAIL_API_LOG account.'); } if ($return_status != 0) { http_response_code($this->config->item('general_http_error_code')); - die('failed to generate arff file. Evaluation Engine result omitted (no EMAIL_API_LOG defined). '); + die('failed to perform action ' . $function . '. Evaluation Engine result omitted (no EMAIL_API_LOG defined). '); } } else { http_response_code($this->config->item('general_http_error_code')); - die('failed to generate arff file: php "exec" function disabled. '); + die('failed to perform action ' . $function . ': php "exec" function disabled. '); } } } diff --git a/openml_OS/libraries/ElasticSearch.php b/openml_OS/libraries/ElasticSearch.php index 270619996..1e1b47f9b 100644 --- a/openml_OS/libraries/ElasticSearch.php +++ b/openml_OS/libraries/ElasticSearch.php @@ -761,7 +761,7 @@ private function build_study($d) { 'uploader_id' => $d->creator, 'uploader' => array_key_exists($d->creator, $this->user_names) ? $this->user_names[$d->creator] : 'Unknown', 'visibility' => $d->visibility, - 'type' => $d->main_entity_type, + 'study_type' => $d->main_entity_type, 'legacy' => $d->legacy, 'suggest' => array( 'input' => array($d->name, $d->description . 
' '), @@ -890,7 +890,7 @@ private function build_task($d) { $did = 0; if ($task) { foreach ($task as $t) { - if ($t->type == 'Dataset') { + if ($t->input == 'source_data') { $description[] = $this->data_names[$t->value]; $newdata[$t->input] = array( 'type' => $t->type, @@ -898,7 +898,7 @@ private function build_task($d) { 'name' => $this->data_names[$t->value] ); $did = $t->value; - } else if ($t->type == 'Estimation Procedure') { + } else if ($t->input == 'estimation_procedure') { $description[] = $this->procedure_names[$t->value]; $newdata[$t->input] = array( 'type' => $t->type, @@ -1064,10 +1064,12 @@ private function fetch_classes($id = false) { private function fetch_runfiles($min, $max) { $index = array(); - foreach ($this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max) as $r) { - $index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name; - $index[$r->source][$r->field]['format'] = $r->format; - } + $runfiles = $this->db->query('SELECT source, field, name, format, file_id from runfile where source >= ' . $min . ' and source < ' . $max); + if ($runfiles) + foreach ($runfiles as $r) { + $index[$r->source][$r->field]['url'] = BASE_URL . 'data/download/' . $r->file_id . '/' . $r->name; + $index[$r->source][$r->field]['format'] = $r->format; + } return $index; } @@ -1439,16 +1441,16 @@ private function build_task_type($d) { ); $inputs = $this->db->query('SELECT name, type, description, io, requirement FROM task_type_inout where ttid=' . 
$d->ttid); - - foreach ($inputs as $i) { - $new_data['input'][] = array( - 'name' => $i->name, - 'type' => $i->type, - 'description' => $i->description, - 'io' => $i->io, - 'requirement' => $i->requirement - ); - } + if ($inputs) + foreach ($inputs as $i) { + $new_data['input'][] = array( + 'name' => $i->name, + 'type' => $i->type, + 'description' => $i->description, + 'io' => $i->io, + 'requirement' => $i->requirement + ); + } return $new_data; } @@ -1767,15 +1769,16 @@ public function index_single_dataset($id) { if ($id and ! $datasets) return 'Error: data set ' . $id . ' is unknown'; - foreach ($datasets as $d) { - $params['body'][] = array( - 'index' => array( - '_id' => $d->did - ) - ); + if ($datasets) + foreach ($datasets as $d) { + $params['body'][] = array( + 'index' => array( + '_id' => $d->did + ) + ); - $params['body'][] = $this->build_data($d); - } + $params['body'][] = $this->build_data($d); + } $responses = $this->client->bulk($params); diff --git a/openml_OS/models/Algorithm_setup.php b/openml_OS/models/Algorithm_setup.php index 2e7d22589..35a8a6c97 100644 --- a/openml_OS/models/Algorithm_setup.php +++ b/openml_OS/models/Algorithm_setup.php @@ -138,6 +138,32 @@ function createSetup($implementation, $parameters, $setup_string) { } return $setupId; } + + public function setup_ids_to_parameter_values($setups) { + // query fails for classifiers without parameters. is fixed further on. 
+ $this->db->select('input.*, input_setting.*, `implementation`.`name` AS `flow_name`, `implementation`.`fullName` AS `flow_fullName`')->from('input_setting'); + $this->db->join('input', 'input_setting.input_id = input.id', 'inner'); + $this->db->join('implementation', 'input.implementation_id = implementation.id', 'inner'); + // note that algorithm setup can not be linked to implementation id, otherwise we will only get parameters of the root classifier + $this->db->join('algorithm_setup', 'algorithm_setup.sid = input_setting.setup', 'inner'); + $this->db->join('setup_tag', 'input_setting.setup = setup_tag.id', 'left'); + $this->db->where_in('algorithm_setup.sid', $setups); + + $query = $this->db->get(); + $parameters = $query->result(); + + $per_setup = array(); + // initialize the array + foreach ($setups as $setup) { + $per_setup[$setup] = array(); + } + // now fill with parameters + foreach ($parameters as $parameter) { + $per_setup[$parameter->setup][] = $parameter; + } + + return $per_setup; + } } ?> diff --git a/openml_OS/models/api/v1/Api_data.php b/openml_OS/models/api/v1/Api_data.php index 35e9168cc..c2c52de77 100644 --- a/openml_OS/models/api/v1/Api_data.php +++ b/openml_OS/models/api/v1/Api_data.php @@ -367,6 +367,12 @@ private function data_upload() { // get description from string upload $description = $this->input->post('description', false); if(validateXml($description, $xsdFile, $xmlErrors, false ) == false) { + if (DEBUG) { + $to = $this->user_email; + $subject = 'OpenML Data Upload DEBUG message. '; + $content = "Uploaded POST field \nXSD Validation Message: " . $xmlErrors . "\n=====BEGIN XML=====\n" . 
$description; + sendEmail($to, $subject, $content,'text'); + } $this->returnError(131, $this->version, $this->openmlGeneralErrorCode, $xmlErrors); return; } @@ -376,11 +382,18 @@ private function data_upload() { $xmlErrors = ''; if (check_uploaded_file($_FILES['description'], false, $uploadError) == false) { $this->returnError(135, $this->version, $this->openmlGeneralErrorCode, $uploadError); + return; } // get description from file upload $description = $_FILES['description']; if (validateXml($description['tmp_name'], $xsdFile, $xmlErrors) == false) { + if (DEBUG) { + $to = $this->user_email; + $subject = 'OpenML Data Upload DEBUG message. '; + $content = 'Filename: ' . $description['name'] . "\nXSD Validation Message: " . $xmlErrors . "\n=====BEGIN XML=====\n" . file_get_contents($description['tmp_name']); + sendEmail($to, $subject, $content,'text'); + } $this->returnError(131, $this->version, $this->openmlGeneralErrorCode, $xmlErrors); return; } @@ -657,17 +670,18 @@ private function data_features_upload() { $this->returnError(442, $this->version); return; } - - // get description from string upload + + // get description from file upload. Note that we will check the XSD later on (after we have assembled fields for error handling) $description = $_FILES['description']; - if (validateXml($description['tmp_name'], xsd('openml.data.features', $this->controller, $this->version), $xmlErrors) == false) { - $this->returnError(443, $this->version, $this->openmlGeneralErrorCode, $xmlErrors); + $xml = simplexml_load_file($description['tmp_name']); + + // precheck XSD (if this pre-check succeeds, we can do database error logging later) + if (!($xml->children('oml', true)->{'did'} && $xml->children('oml', true)->{'evaluation_engine_id'})) { + $this->returnError(443, $this->version, $this->openmlGeneralErrorCode, 'XML misses basic fields did or evaluation_engine_id'); return; } - - $xml = simplexml_load_file($description['tmp_name']); - $did = ''. 
$xml->children('oml', true)->{'did'}; - $eval_id = ''.$xml->children('oml', true)->{'evaluation_engine_id'}; + $did = ''. $xml->children('oml', true)->{'did'}; // Note that this relies on this field being in the xml + $eval_id = ''.$xml->children('oml', true)->{'evaluation_engine_id'}; // Note that this relies on this field being in the xml if (!is_numeric($did) || !is_numeric($eval_id) || $did <= 0 || $eval_id <= 0) { $this->returnError(446, $this->version); @@ -700,6 +714,19 @@ private function data_features_upload() { if ($xml->children('oml', true)->{'error'}) { $data['error'] = htmlentities($xml->children('oml', true)->{'error'}); } + + if (validateXml($description['tmp_name'], xsd('openml.data.features', $this->controller, $this->version), $xmlErrors) == false) { + $data['error'] = 'XSD does not comply. XSD errors: ' . $xmlErrors; + $success = $this->Data_processed->replace($data); + if (DEBUG) { + $to = $this->user_email; + $subject = 'OpenML Data Features Upload DEBUG message. '; + $content = 'Filename: ' . $description['name'] . "\nXSD Validation Message: " . $xmlErrors . "\n=====BEGIN XML=====\n" . 
file_get_contents($description['tmp_name']); + sendEmail($to, $subject, $content, 'text'); + } + $this->returnError(443, $this->version, $this->openmlGeneralErrorCode, $xmlErrors); + return; + } $this->db->trans_start(); @@ -800,13 +827,13 @@ private function data_features_upload() { // $data['default_target_attribute'] = $feature->name; //} } - $this->db->trans_complete(); - - if ($success) { - $this->xmlContents('data-features-upload', $this->version, array('did' => $dataset->did)); - } else { + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); $this->returnError(445, $this->version); return; + } else { + $this->db->trans_commit(); + $this->xmlContents('data-features-upload', $this->version, array('did' => $dataset->did)); } } @@ -1057,7 +1084,15 @@ private function data_qualities_upload() { $result = $this->Data_quality->insert_ignore($data); } } - $this->db->trans_complete(); + + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); + $this->returnError(389, $this->version); + return; + } else { + $this->db->trans_commit(); + $this->xmlContents('data-qualities-upload', $this->version, array('did' => $did)); + } // add to elastic search index. 
try { @@ -1067,13 +1102,6 @@ private function data_qualities_upload() { $this->returnError(105, $this->version, $this->openmlGeneralErrorCode, $additionalMsg); return; } - - if ($success) { - $this->xmlContents('data-qualities-upload', $this->version, array('did' => $did)); - } else { - $this->returnError(389, $this->version); - return; - } } private function data_unprocessed($evaluation_engine_id, $order) { diff --git a/openml_OS/models/api/v1/Api_evaluation.php b/openml_OS/models/api/v1/Api_evaluation.php index 26733307d..42e460722 100644 --- a/openml_OS/models/api/v1/Api_evaluation.php +++ b/openml_OS/models/api/v1/Api_evaluation.php @@ -15,9 +15,14 @@ function bootstrap($format, $segments, $request_type, $user_id) { $getpost = array('get','post'); - if (count($segments) >= 1 && $segments[0] == 'list') { + if (count($segments) >= 1 && $segments[0] == 'setup' && $segments[1] == 'list') { array_shift($segments); - $this->evaluation_list($segments, $user_id); + array_shift($segments); + $this->evaluation_list($segments, $user_id, true); + return; + } elseif (count($segments) >= 1 && $segments[0] == 'list') { + array_shift($segments); + $this->evaluation_list($segments, $user_id, false); return; } @@ -89,9 +94,9 @@ private function evaluation_request($evaluation_engine_id, $order, $num_request } - private function evaluation_list($segs, $user_id) { + private function evaluation_list($segs, $user_id, $show_params) { $result_limit = 10000; - $legal_filters = array('task', 'setup', 'flow', 'uploader', 'run', 'tag', 'limit', 'offset', 'function', 'per_fold', 'sort_order'); + $legal_filters = array('task', 'setup', 'flow', 'uploader', 'run', 'tag', 'limit', 'offset', 'function', 'per_fold', 'sort_order', 'study'); list($query_string, $illegal_filters) = $this->parse_filters($segs, $legal_filters); if (count($illegal_filters) > 0) { $this->returnError(544, $this->version, $this->openmlGeneralErrorCode, 'Legal filter operators: ' . implode(',', $legal_filters) .'. 
Found illegal filter(s): ' . implode(', ', $illegal_filters)); @@ -115,6 +120,7 @@ private function evaluation_list($segs, $user_id) { $offset = element('offset', $query_string, null); $per_fold = element('per_fold', $query_string, null); $sort_order = element('sort_order', $query_string, null); + $study_id = element('study', $query_string, null); if ($per_fold != 'true' && $per_fold != 'false' && $per_fold != null) { $this->returnError(547, $this->version, $this->openmlGeneralErrorCode, 'Filters with illegal values: ' . implode(',', $illegal_filter_inputs)); return; @@ -133,8 +139,16 @@ private function evaluation_list($segs, $user_id) { $this->returnError(549, $this->version); return; } + + if ($study_id) { + $study = $this->Study->getById($study_id); + if ($study === false || $study->legacy != 'n' || $study->main_entity_type != 'run') { + $this->returnError(555, $this->version); + return; + } + } - if ($task_id === null && $setup_id === null && $implementation_id === null && $uploader_id === null && $run_id === null && $tag === null && $limit === null && $function_name === null) { + if ($task_id === null && $setup_id === null && $implementation_id === null && $uploader_id === null && $run_id === null && $tag === null && $study_id === null && $limit === null && $function_name === null) { $this->returnError(540, $this->version); return; } @@ -146,13 +160,14 @@ private function evaluation_list($segs, $user_id) { $where_run = $run_id === null ? '' : ' AND `r`.`rid` IN (' . $run_id . ') '; $where_function = $function_name === null ? '' : ' AND `f`.`name` = "' . $function_name . '" '; $where_tag = $tag === null ? '' : ' AND `r`.`rid` IN (select id from run_tag where tag="' . $tag . '") '; + $where_study = $study_id === null ? '' : ' AND `r`.`rid` IN (SELECT `run_id` FROM `run_study` WHERE `study_id`="' . $study_id . '") '; $where_limit = $limit === null ? null : ' LIMIT ' . $limit; if ($limit && $offset) { $where_limit = ' LIMIT ' . $offset . ',' . 
$limit; } $where_task_closed = ' AND (`t`.`embargo_end_date` is NULL OR `t`.`embargo_end_date` < NOW() OR `r`.`uploader` = '.$user_id.')'; - $where_runs = $where_task . $where_setup . $where_uploader . $where_impl . $where_run . $where_tag . $where_task_closed; + $where_runs = $where_task . $where_setup . $where_uploader . $where_impl . $where_run . $where_tag . $where_study . $where_task_closed; $where_total = $where_runs . $where_function; //pre-test, should be quick?? @@ -234,7 +249,18 @@ private function evaluation_list($segs, $user_id) { $this->returnError(542, $this->version); return; } - + + if ($show_params) { + # 2 stage query .. unfortunately. Can break when too much results. let's take the damage for now + $setup_ids = array(); + foreach ($res as $r) { + $setup_ids[] = $r->sid; + } + $params = $this->Algorithm_setup->setup_ids_to_parameter_values(array_unique($setup_ids)); + for ($i = 0; $i < count($res); ++$i) { + $res[$i]->parameters = $params[$res[$i]->sid]; + } + } $this->xmlContents('evaluations', $this->version, array('evaluations' => $res)); } } diff --git a/openml_OS/models/api/v1/Api_flow.php b/openml_OS/models/api/v1/Api_flow.php index d570ea3c8..7c1f61f55 100644 --- a/openml_OS/models/api/v1/Api_flow.php +++ b/openml_OS/models/api/v1/Api_flow.php @@ -302,12 +302,12 @@ private function flow_delete($flow_id) { $implementation = $this->Implementation->getById($flow_id); if($implementation == false) { $this->returnError(322, $this->version); - return; + return false; } if($implementation->uploader != $this->user_id && $this->user_has_admin_rights == false) { $this->returnError(323, $this->version); - return; + return false; } $runs = $this->Run->getRunsByFlowId($implementation->id, null, null, 100); @@ -318,24 +318,24 @@ private function flow_delete($flow_id) { $ids[] = $r->id; } $this->returnError(324, $this->version, $this->openmlGeneralErrorCode, '{'. 
implode(', ', $ids) .'} ()'); - return; + return false; } if ($this->Implementation->isComponent($implementation->id)) { $parent_ids = $this->Implementation_component->getColumnWhere('parent', 'child = "'.$implementation->id.'"'); $this->returnError(328, $this->version, $this->openmlGeneralErrorCode, '{' . implode(', ', $parent_ids) . '}'); - return; + return false; } $remove_input_setting = $this->Input_setting->deleteWhere('setup IN (SELECT sid FROM algorithm_setup WHERE implementation_id = '.$implementation->id.')'); if (!$remove_input_setting) { $this->returnError(326, $this->version); - return; + return false; } $remove_setups = $this->Algorithm_setup->deleteWhere('implementation_id = ' . $implementation->id); if (!$remove_setups) { $this->returnError(327, $this->version); - return; + return false; } $this->Input->deleteWhere('implementation_id =' . $implementation->id); // should be handled by constraints .. @@ -348,7 +348,7 @@ private function flow_delete($flow_id) { if($result == false) { $this->returnError(325, $this->version); - return; + return false; } try { @@ -357,10 +357,11 @@ private function flow_delete($flow_id) { } catch (Exception $e) { $additionalMsg = get_class() . '.' . __FUNCTION__ . ':' . $e->getMessage(); $this->returnError(105, $this->version, $this->openmlGeneralErrorCode, $additionalMsg); - return; + return true; // don't care about ES errors, as they pop up anyway } $this->xmlContents('implementation-delete', $this->version, array('implementation' => $implementation)); + return true; } private function flow_forcedelete($flow_id) { @@ -379,15 +380,20 @@ private function flow_forcedelete($flow_id) { 'run' => 'DELETE FROM run WHERE setup IN (SELECT sid FROM algorithm_setup WHERE implementation_id = '.$flow_id.');', 'algorithm_setup' => 'DELETE FROM algorithm_setup WHERE implementation_id = ' . $flow_id . 
';' ); - + + $this->db->trans_start(); foreach ($queries as $table => $query) { - $res = $this->Implementation->query($query); - if ($res == false) { - $this->returnError(551, $this->version, $this->openmlGeneralErrorCode, 'In query table: ' . $table); - return; - } + $this->Implementation->query($query); } - + + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); + $this->returnError(551, $this->version); + return; + } + $this->db->trans_commit(); + + // now delete the actual flow. This will trigger an XML template $this->flow_delete($flow_id); } diff --git a/openml_OS/models/api/v1/Api_run.php b/openml_OS/models/api/v1/Api_run.php index 4676853c8..bf014e4f8 100644 --- a/openml_OS/models/api/v1/Api_run.php +++ b/openml_OS/models/api/v1/Api_run.php @@ -104,7 +104,7 @@ function bootstrap($format, $segments, $request_type, $user_id) { private function run_list($segs, $user_id) { $result_limit = 10000; - $legal_filters = array('task', 'setup', 'flow', 'uploader', 'run', 'tag', 'limit', 'offset', 'task_type', 'show_errors'); + $legal_filters = array('task', 'setup', 'flow', 'uploader', 'run', 'tag', 'limit', 'offset', 'task_type', 'study', 'show_errors'); list($query_string, $illegal_filters) = $this->parse_filters($segs, $legal_filters); if (count($illegal_filters) > 0) { @@ -127,6 +127,7 @@ private function run_list($segs, $user_id) { $tag = element('tag',$query_string, null); $limit = element('limit',$query_string, null); $offset = element('offset',$query_string, null); + $study_id = element('study', $query_string, null); $show_errors = element('show_errors',$query_string, null); if ($offset && !$limit) { @@ -138,7 +139,15 @@ private function run_list($segs, $user_id) { return; } - if ($task_id === null && $task_type_id === null && $setup_id === null && $implementation_id === null && $uploader_id === null && $run_id === null && $tag === null && $limit === null) { + if ($study_id) { + $study = $this->Study->getById($study_id); + if ($study === 
false || $study->legacy != 'n' || $study->main_entity_type != 'run') { + $this->returnError(517, $this->version); + return; + } + } + + if ($task_id === null && $task_type_id === null && $setup_id === null && $implementation_id === null && $uploader_id === null && $run_id === null && $tag === null && $limit === null && $study_id === null) { $this->returnError(510, $this->version); return; } @@ -150,6 +159,7 @@ private function run_list($segs, $user_id) { $where_impl = $implementation_id === null ? '' : ' AND `i`.`id` IN (' . $implementation_id . ') '; $where_run = $run_id === null ? '' : ' AND `r`.`rid` IN (' . $run_id . ') '; $where_tag = $tag === null ? '' : ' AND `r`.`rid` IN (select id from run_tag where tag="' . $tag . '") '; + $where_study = $study_id === null ? '' : ' AND `r`.`rid` IN (SELECT `run_id` FROM `run_study` WHERE `study_id`="' . $study_id . '") '; // TODO: runs with errors are always removed? $where_server_error = ' AND `e`.`error` IS NULL '; if (strtolower($show_errors) == 'true') { @@ -163,7 +173,7 @@ private function run_list($segs, $user_id) { $where_limit = ' LIMIT ' . $offset . ', ' . $limit; } - $where_total = $where_task . $where_task_type . $where_setup . $where_uploader . $where_impl . $where_run . $where_tag . $where_server_error . $where_task_closed; + $where_total = $where_task . $where_task_type . $where_setup . $where_uploader . $where_impl . $where_run . $where_tag . $where_study . $where_server_error . $where_task_closed; $sql = 'SELECT r.rid, r.uploader, r.task_id, r.start_time, t.ttid, d.did AS dataset_id, d.name AS dataset_name,' . @@ -232,30 +242,26 @@ private function run_delete($run_id) { $this->returnError( 393, $this->version ); return; } + + $this->db->trans_start(); + + $this->Input_data->deleteWhere( 'run =' . $run->rid ); + $this->Output_data->deleteWhere( 'run =' . 
$run->rid ); + + $additional_sql = ''; //' AND `did` NOT IN (SELECT `data` FROM `input_data` UNION SELECT `data` FROM `output_data`)'; + $this->Runfile->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); + $this->Evaluation->deleteWhere('`source` = "' . $run->rid. '" ' . $additional_sql); + $this->Evaluation_fold->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); + $this->Evaluation_sample->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); + $this->Run_evaluated->deleteWhere('`run_id` = "' . $run->rid . '" '); + $this->Run->delete( $run->rid ); - $result = true; - $result = $result && $this->Input_data->deleteWhere( 'run =' . $run->rid ); - $result = $result && $this->Output_data->deleteWhere( 'run =' . $run->rid ); - - if( $result ) { - $additional_sql = ''; //' AND `did` NOT IN (SELECT `data` FROM `input_data` UNION SELECT `data` FROM `output_data`)'; - $result = $result && $this->Runfile->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); - $result = $result && $this->Evaluation->deleteWhere('`source` = "' . $run->rid. '" ' . $additional_sql); - $result = $result && $this->Evaluation_fold->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); - $result = $result && $this->Evaluation_sample->deleteWhere('`source` = "' . $run->rid . '" ' . $additional_sql); - $result = $result && $this->Run_evaluated->deleteWhere('`run_id` = "' . $run->rid . '" '); - // Not needed - //$this->Dataset->deleteWhere('`source` = "' . $run->rid . '" ' . 
$additional_sql); - } - - if( $result ) { - $result = $result && $this->Run->delete( $run->rid ); - } - - if( $result == false ) { + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); $this->returnError( 394, $this->version ); return; } + $this->db->trans_commit(); try { $this->elasticsearch->delete('run', $run_id); @@ -542,6 +548,7 @@ private function run_upload() { $did = $this->Runfile->insert($record); if( $did == false ) { + $this->db->trans_rollback(); $this->returnError(212, $this->version); return; } @@ -554,10 +561,13 @@ private function run_upload() { $errorCode = 211; return false; } - $this->db->trans_complete(); - if ($this->db->trans_status() === FALSE) { + + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); $this->returnError(224, $this->version); return; + } else { + $this->db->trans_commit(); } $timestamps[] = microtime(true); // profiling 3 @@ -583,13 +593,9 @@ private function run_upload() { // tag it, if neccessary foreach($tags as $tag) { $success = $this->entity_tag_untag('run', $runId, $tag, false, 'run', true); - // if tagging went wrong, an error is displayed. (TODO: something else?) - if (!$success) return; + // on failure, we ignore it (just a tag) } - // remove scheduled task - $this->Schedule->deleteWhere( 'task_id = "' . $task_id . '" AND sid = "' . $setupId . '"' ); - // and present result, in effect only a run_id. 
$this->xmlContents( 'run-upload', $this->version, $result ); } @@ -648,10 +654,13 @@ private function run_trace_upload() { $this->Trace->insert($iteration); } - $this->db->trans_complete(); + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); $this->returnError(564, $this->version); return; + } else { + $this->db->trans_commit(); } $this->xmlContents('run-trace', $this->version, array('run_id' => $run_id)); @@ -836,10 +845,13 @@ private function run_evaluate() { $this->Evaluation->insert($evaluation); } } - $this->db->trans_complete(); + if ($this->db->trans_status() === FALSE) { + $this->db->trans_rollback(); $this->returnError(428, $this->version); return; + } else { + $this->db->trans_commit(); } diff --git a/openml_OS/models/api/v1/Api_setup.php b/openml_OS/models/api/v1/Api_setup.php index 1e94d2518..ca66360d1 100644 --- a/openml_OS/models/api/v1/Api_setup.php +++ b/openml_OS/models/api/v1/Api_setup.php @@ -106,32 +106,6 @@ private function setup($setup_id) { } } - private function _setup_ids_to_parameter_values($setups) { - // query fails for classifiers without parameters. is fixed further on. 
- $this->db->select('input.*, input_setting.*, `implementation`.`name` AS `flow_name`, `implementation`.`fullName` AS `flow_fullName`')->from('input_setting'); - $this->db->join('input', 'input_setting.input_id = input.id', 'inner'); - $this->db->join('implementation', 'input.implementation_id = implementation.id', 'inner'); - // note that algorithm setup can not be linked to implementation id, otherwise we will only get parameters of the root classifier - $this->db->join('algorithm_setup', 'algorithm_setup.sid = input_setting.setup', 'inner'); - $this->db->join('setup_tag', 'input_setting.setup = setup_tag.id', 'left'); - $this->db->where_in('algorithm_setup.sid', $setups); - - $query = $this->db->get(); - $parameters = $query->result(); - - $per_setup = array(); - // initialize the array - foreach ($setups as $setup) { - $per_setup[$setup] = array(); - } - // now fill with parameters - foreach ($parameters as $parameter) { - $per_setup[$parameter->setup][] = $parameter; - } - - return $per_setup; - } - function setup_list($segs) { $result_limit = 1000; $legal_filters = array('flow', 'setup', 'limit', 'offset', 'tag'); @@ -198,7 +172,7 @@ function setup_list($segs) { return; } - $per_setup = $this->_setup_ids_to_parameter_values($setups); + $per_setup = $this->Algorithm_setup->setup_ids_to_parameter_values($setups); $this->xmlContents('setup-list', $this->version, array('setups' => $per_setup, 'setup_flows' => $setup_flows)); } @@ -264,7 +238,11 @@ private function setup_exists($partial) { try { $setups = $this->Algorithm_setup->searchSetup($implementation, $parameters, $partial); } catch(Exception $e) { - $this->returnError(588, $this->version); + $additional_message = null; + if (substr($e->getMessage(), 0, 5) == '1116:') { + $additional_message = 'Flow might not be suitable for this operation (feature request)'; + } + $this->returnError(588, $this->version, $this->openmlGeneralErrorCode, $additional_message); return; } @@ -296,7 +274,7 @@ private function 
setup_exists($partial) { } // TODO: two-stage query, not ideal please fix! - $per_setup = $this->_setup_ids_to_parameter_values(array_keys($setup_flows)); + $per_setup = $this->Algorithm_setup->setup_ids_to_parameter_values(array_keys($setup_flows)); $this->xmlContents('setup-list', $this->version, array('setups' => $per_setup, 'setup_flows' => $setup_flows)); } diff --git a/openml_OS/models/api/v1/Api_task.php b/openml_OS/models/api/v1/Api_task.php index acf37cb2a..6e23ff0cc 100644 --- a/openml_OS/models/api/v1/Api_task.php +++ b/openml_OS/models/api/v1/Api_task.php @@ -192,7 +192,8 @@ private function task_inputs($task_id) { $this->returnError(157, $this->version); return; } - + + // TODO: tags! $this->xmlContents('task-inputs', $this->version, array('task' => $task, 'inputs' => $inputs)); } @@ -337,7 +338,7 @@ public function task_upload() { } if (!in_array($input_value, $acceptable_inputs)) { - $this->returnError(622, $this->version, $this->openmlGeneralErrorCode, 'problematic input: ' . $name); + $this->returnError(622, $this->version, $this->openmlGeneralErrorCode, 'problematic input: [' . $name . '], acceptable inputs: [' . implode(', ', $acceptable_inputs) . ']'); return; } } @@ -346,10 +347,21 @@ public function task_upload() { if ($name == 'source_data' /*|| (trim($constraints['select']) == 'did' && trim($constraints['from'] == 'dataset'))*/) { $status_record = $this->Dataset_status->getWhereSingle('did = ' . $input_value, '`status` DESC'); if (!$status_record || $status_record->status != 'active') { - $this->returnError(623, $this->version, $this->openmlGeneralErrorCode, 'problematic input: ' . $name); + $this->returnError(623, $this->version, $this->openmlGeneralErrorCode, 'problematic input: ' . $name . ', dataset not active.'); return; } } + // more hard constraints. 
should be replaced / refactored + if ($name == 'source_data_list' /*|| (trim($constraints['select']) == 'did' && trim($constraints['from'] == 'dataset'))*/) { + $input_values = json_decode($input_value); + foreach ($input_values as $v) { + $status_record = $this->Dataset_status->getWhereSingle('did = ' . $v, '`status` DESC'); + if (!$status_record || $status_record->status != 'active') { + $this->returnError(623, $this->version, $this->openmlGeneralErrorCode, 'problematic input: ' . $name . ', dataset not active: ' . $v); + return; + } + } + } // maybe a required input is satisfied unset($required_inputs[$name]); diff --git a/openml_OS/third_party/OpenML/Java/evaluate.jar b/openml_OS/third_party/OpenML/Java/evaluate.jar index c1717c93b..cba9e78bd 100644 Binary files a/openml_OS/third_party/OpenML/Java/evaluate.jar and b/openml_OS/third_party/OpenML/Java/evaluate.jar differ diff --git a/openml_OS/views/pages/api_new/v1/json/data.tpl.php b/openml_OS/views/pages/api_new/v1/json/data.tpl.php index 762eb41cd..b7ba61317 100644 --- a/openml_OS/views/pages/api_new/v1/json/data.tpl.php +++ b/openml_OS/views/pages/api_new/v1/json/data.tpl.php @@ -8,6 +8,7 @@ "version":version; ?>, "status":"status; ?>", "format":"format; ?>", + "md5_checksum":"md5_checksum; ?>", file_id != null): /* optional field! */?> "file_id": file_id; ?>, diff --git a/openml_OS/views/pages/api_new/v1/json/evaluations.tpl.php b/openml_OS/views/pages/api_new/v1/json/evaluations.tpl.php index 80fcc5899..29a3045cd 100644 --- a/openml_OS/views/pages/api_new/v1/json/evaluations.tpl.php +++ b/openml_OS/views/pages/api_new/v1/json/evaluations.tpl.php @@ -8,6 +8,22 @@ "setup_id":sid; ?>, "flow_id":implementation_id; ?>, "flow_name":"fullName; ?>", + + "parameters": [ + parameters); ++$i): ?> + parameters[$i]; + if ($i>0) echo ","; ?> + {"id": id); ?>, + "flow_id": implementation_id); ?>, + "flow_name": "flow_name); ?>", + "full_name": "flow_fullName) . '_' . 
htmlspecialchars($p->name); ?>", + "parameter_name": "name); ?>", + "data_type": "dataType); ?>", + "default_value": "defaultValue); ?>", + "value": "value); ?>"} + + ], + "data_name":"name; ?>", "function":"{'function'}; ?>", "upload_time":"start_time; ?>" diff --git a/openml_OS/views/pages/api_new/v1/xml/data-get.tpl.php b/openml_OS/views/pages/api_new/v1/xml/data-get.tpl.php index a1a59b4e9..0595be7fe 100644 --- a/openml_OS/views/pages/api_new/v1/xml/data-get.tpl.php +++ b/openml_OS/views/pages/api_new/v1/xml/data-get.tpl.php @@ -5,9 +5,9 @@ - - - + + + diff --git a/openml_OS/views/pages/api_new/v1/xml/evaluations.tpl.php b/openml_OS/views/pages/api_new/v1/xml/evaluations.tpl.php index d11116b28..d1a860c13 100644 --- a/openml_OS/views/pages/api_new/v1/xml/evaluations.tpl.php +++ b/openml_OS/views/pages/api_new/v1/xml/evaluations.tpl.php @@ -7,6 +7,22 @@ sid; ?> implementation_id; ?> fullName; ?> + + + parameters as $p): ?> + + id); ?> + implementation_id); /*important! this is different from $p->flow_id; */?> + flow_name); ?> + flow_fullName) . '_' . htmlspecialchars($p->name); ?> + name); ?> + dataType); ?> + defaultValue); ?> + value); ?> + + + + did != null): ?>did; ?> name != null): ?>name; ?> evaluation_engine_id; ?> diff --git a/openml_OS/views/pages/api_new/v1/xml/pre.php b/openml_OS/views/pages/api_new/v1/xml/pre.php index 25c38a4b1..e4bbf62a7 100644 --- a/openml_OS/views/pages/api_new/v1/xml/pre.php +++ b/openml_OS/views/pages/api_new/v1/xml/pre.php @@ -281,6 +281,7 @@ $this->apiErrors[514] = 'Illegal filter specified'; $this->apiErrors[515] = 'Can not specify offset without limit'; $this->apiErrors[516] = 'Requested result limit too high. '; +$this->apiErrors[517] = 'Problem with the study filter. 
The study should exists, be run-based and non-legacy'; // openml.estimationprocedure.list $this->apiErrors[520] = 'No results'; @@ -301,6 +302,9 @@ $this->apiErrors[550] = 'Admin rights are required.'; $this->apiErrors[551] = 'Delete query failed.'; +// openml.evaluations.list [continued] +$this->apiErrors[555] = 'Problem with the study filter. The study should exists, be run-based and non-legacy'; + // openml.run.trace.upload $this->apiErrors[561] = 'Problem with uploaded trace file.'; $this->apiErrors[562] = 'Problem validating xml trace file.'; diff --git a/openml_OS/views/pages/frontend/cite/body.php b/openml_OS/views/pages/frontend/cite/body.php index a181704dd..29b81c791 100644 --- a/openml_OS/views/pages/frontend/cite/body.php +++ b/openml_OS/views/pages/frontend/cite/body.php @@ -27,7 +27,23 @@

-

If you have used the OpenML R package, please also cite the following paper:

+

If you have used the OpenML Python package, please also cite:

+

Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Mueller, Joaquin Vanschoren, Frank Hutter. + OpenML-Python: an extensible Python API for OpenML. arXiv:1911.02490 [cs.LG], 2019
+ Show BibTeX - Read on arXiv + +

+ @article{OpenMLPython2019,
+ author = {Matthias Feurer, Jan N. van Rijn, Arlind Kadra, Pieter Gijsbers, Neeratyoy Mallik, Sahithya Ravi, Andreas Mueller, Joaquin Vanschoren, Frank Hutter},
+ title = {OpenML-Python: an extensible Python API for OpenML},
+ journal = {arXiv},
+ volume = {1911.02490},
+ url = {https://arxiv.org/pdf/1911.02490.pdf},
+ } +
+

+ +

If you have used the OpenML R package, please also cite:

Giuseppe Casalicchio, Jakob Bossek, Michel Lang, Dominik Kirchhoff, Pascal Kerschke, Benjamin Hofner, Heidi Seibold, Joaquin Vanschoren, Bernd Bischl. OpenML: An R package to connect to the machine learning platform OpenML. Computational Statistics 32 (3), pp 1-15, 2017
Show BibTeX - Read on arXiv diff --git a/openml_OS/views/pages/frontend/home/body.php b/openml_OS/views/pages/frontend/home/body.php index 275e37c4b..70cf7adda 100644 --- a/openml_OS/views/pages/frontend/home/body.php +++ b/openml_OS/views/pages/frontend/home/body.php @@ -32,35 +32,34 @@

- icon + icon
-

HACKATHON

-

Bring your own data, bring your own algorithms, or build cool new features.

-

15-18 April 2019, Den Bosch, The Netherlands. Register now!

+

Python API 0.10 is released

+

Get started now or read the paper first :)

- icon + icon
-

OpenML on Open Science Radio

-

Check out the podcast.

+

HACKATHON

+

Bring your own data, bring your own algorithms, or build cool new features.

+

Next workshop: March 2020, Munich, Germany

- icon + icon