
Commit
Remove transpose() in get_data. Closes #130
Remove the transpose() in get_data to adhere to the sklearn convention. Our datasets on disk don't need to change. Update our grammars to use the x[:, 0] style (not x[0]); new grammars must do this also. Add error checking to give the user a useful error message on this, and improve error checking with OPTIMIZE_CONSTANTS as well.
jmmcd committed Oct 17, 2021
1 parent ddb763d commit e1b9197
Showing 11 changed files with 43 additions and 18 deletions.
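For context: under the sklearn convention, the input matrix x has shape (n_samples, n_features), so x[0] selects one sample (a row) while x[:, 0] selects one feature across all samples (a column). A minimal sketch of why evolved phenotypes must use the column style (illustrative code, not part of the commit):

    import numpy as np

    # sklearn convention: one row per example, one column per feature.
    x = np.arange(12.0).reshape(4, 3)  # 4 samples, 3 features
    y = np.zeros(4)                    # one target value per sample

    print(x[0].shape)     # (3,)  one sample: the old grammar style, wrong shape
    print(x[:, 0].shape)  # (4,)  feature 0 for every sample: matches y

    # A phenotype such as "x[:, 0] + x[:, 1]" evaluates to a vector of
    # length n_samples, so y and yhat line up.
    yhat = eval("x[:, 0] + x[:, 1]")
    assert y.shape == yhat.shape
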
2 changes: 1 addition & 1 deletion grammars/supervised_learning/Banknote.bnf
@@ -2,6 +2,6 @@
<op> ::= + | * | -
<f1> ::= psqrt | plog
<f2> ::= pdiv
-<v> ::= x[<idx>]
+<v> ::= x[:, <idx>]
<idx> ::= 0 | 1 | 2 | 3
<c> ::= -1.0 | -0.1 | -0.01 | -0.001 | 0.001 | 0.01 | 0.1 | 1.0
2 changes: 1 addition & 1 deletion grammars/supervised_learning/Dow.bnf
@@ -1,2 +1,2 @@
<e> ::= <e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|x[0]|x[1]|x[2]|x[3]|x[4]|x[5]|x[6]|x[7]|x[8]|x[9]|x[10]|x[11]|x[12]|x[13]|x[14]|x[15]|x[16]|x[17]|x[18]|x[19]|x[20]|x[21]|x[22]|x[23]|x[24]|x[25]|x[26]|x[27]|x[28]|x[29]|x[30]|x[31]|x[32]|x[33]|x[34]|x[35]|x[36]|x[37]|x[38]|x[39]|x[40]|x[41]|x[42]|x[43]|x[44]|x[45]|x[46]|x[47]|x[48]|x[49]|x[50]|x[51]|x[52]|x[53]|x[54]|x[55]|x[56]|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>
<e> ::= <e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|<e>+<e>|<e>-<e>|<e>*<e>|pdiv(<e>,<e>)|psqrt(<e>)|np.sin(<e>)|np.tanh(<e>)|np.exp(<e>)|plog(<e>)|x[:, 0]|x[:, 1]|x[:, 2]|x[:, 3]|x[:, 4]|x[:, 5]|x[:, 6]|x[:, 7]|x[:, 8]|x[:, 9]|x[:, 10]|x[:, 11]|x[:, 12]|x[:, 13]|x[:, 14]|x[:, 15]|x[:, 16]|x[:, 17]|x[:, 18]|x[:, 19]|x[:, 20]|x[:, 21]|x[:, 22]|x[:, 23]|x[:, 24]|x[:, 25]|x[:, 26]|x[:, 27]|x[:, 28]|x[:, 29]|x[:, 30]|x[:, 31]|x[:, 32]|x[:, 33]|x[:, 34]|x[:, 35]|x[:, 36]|x[:, 37]|x[:, 38]|x[:, 39]|x[:, 40]|x[:, 41]|x[:, 42]|x[:, 43]|x[:, 44]|x[:, 45]|x[:, 46]|x[:, 47]|x[:, 48]|x[:, 49]|x[:, 50]|x[:, 51]|x[:, 52]|x[:, 53]|x[:, 54]|x[:, 55]|x[:, 56]|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>
<c> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
2 changes: 1 addition & 1 deletion grammars/supervised_learning/Keijzer6.bnf
@@ -7,6 +7,6 @@
np.tanh(<e>)|
np.exp(<e>)|
plog(<e>)|
-x[0]|x[0]|x[0]|x[0]|
+x[:, 0]|x[:, 0]|x[:, 0]|x[:, 0]|
<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>|<c><c>.<c><c>
<c> ::= 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
2 changes: 1 addition & 1 deletion grammars/supervised_learning/boolean.bnf
@@ -1,5 +1,5 @@
<e> ::= (<e> & <e>) |
(<e> "|" <e>) |
~<e> |
-x[<varidx>]
+x[:, <varidx>]
<varidx> ::= GE_RANGE:dataset_n_vars
2 changes: 1 addition & 1 deletion grammars/supervised_learning/if_else_classifier.bnf
@@ -2,5 +2,5 @@
<cond> ::= (<var> == <is>) | <cond> & (<var> == <is>)
<is> ::= GE_RANGE:dataset_n_is
<os> ::= GE_RANGE:dataset_n_os
-<var> ::= x[<varidx>]
+<var> ::= x[:, <varidx>]
<varidx> ::= GE_RANGE:dataset_n_vars
2 changes: 1 addition & 1 deletion grammars/supervised_learning/supervised_learning.bnf
@@ -2,7 +2,7 @@
(<e>-<e>) |
(<e>*<e>) |
aq(<e>,<e>) |
-x[<varidx>] |
+x[:, <varidx>] |
<c>
<varidx> ::= GE_RANGE:dataset_n_vars
<c> ::= <d>.<d> | -<d>.<d>
@@ -2,6 +2,6 @@
(<e>-<e>) |
(<e>*<e>) |
aq(<e>,<e>) |
-x[<idx>] |
+x[:, <idx>] |
c[<idx>] # we allow limited number of constants: same as number of variables
<idx> ::= GE_RANGE:dataset_n_vars
10 changes: 7 additions & 3 deletions src/algorithm/parameters.py
@@ -349,9 +349,13 @@ def set_params(command_line_args, create_files=True):
raise Exception(s)

# Parse grammar file and set grammar class.
-params['BNF_GRAMMAR'] = grammar.Grammar(path.join("..", "grammars",
-                                                  params[
-                                                      'GRAMMAR_FILE']))
+params['BNF_GRAMMAR'] = grammar.Grammar(
+    path.join("..", "grammars", params['GRAMMAR_FILE']))

+# If OPTIMIZE_CONSTANTS, check that the grammar is suitable
+if params['OPTIMIZE_CONSTANTS']:
+    if "c[" not in params['BNF_GRAMMAR'].terminals:
+        raise ValueError("Grammar unsuitable for OPTIMIZE_CONSTANTS")

# Population loading for seeding runs (if specified)
if params['TARGET_SEED_FOLDER']:
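The new OPTIMIZE_CONSTANTS guard in isolation: constant optimization rewrites c[...] terms, so the grammar must contain the literal terminal "c[". A minimal sketch, assuming the grammar parser records "c[" as its own terminal when it splits a rule such as <v> ::= c[<idx>] (the terminals list below is made up for illustration, not PonyGE2's real data structure):

    # Toy stand-in for params['BNF_GRAMMAR'].terminals after parsing a
    # rule like "<v> ::= c[<idx>]": the literal chunks around <idx>
    # become the terminals "c[" and "]".
    terminals = ["x[:, ", "]", "c[", "+", "-", "*"]

    OPTIMIZE_CONSTANTS = True
    if OPTIMIZE_CONSTANTS and "c[" not in terminals:
        raise ValueError("Grammar unsuitable for OPTIMIZE_CONSTANTS")
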
10 changes: 9 additions & 1 deletion src/fitness/supervised_learning/supervised_learning.py
@@ -35,7 +35,7 @@ def __init__(self):
get_data(params['DATASET_TRAIN'], params['DATASET_TEST'])

# Find number of variables.
-self.n_vars = np.shape(self.training_in)[0]
+self.n_vars = np.shape(self.training_in)[1] # sklearn convention

# Regression/classification-style problems use training and test data.
if params['DATASET_TEST']:
@@ -68,6 +68,10 @@ def evaluate(self, ind, **kwargs):
else:
raise ValueError("Unknown dist: " + dist)

shape_mismatch_txt = """Shape mismatch between y and yhat. Please check
that your grammar uses the `x[:, 0]` style, not `x[0]`. Please see change
at https://github.com/PonyGE/PonyGE2/issues/130."""

if params['OPTIMIZE_CONSTANTS']:
# if we are training, then optimize the constants by
# gradient descent and save the resulting phenotype
@@ -85,6 +89,8 @@
# phen will refer to x (ie test_in), and possibly to c
yhat = eval(phen)
assert np.isrealobj(yhat)
+if y.shape != yhat.shape:
+    raise ValueError(shape_mismatch_txt)

# let's always call the error function with the
# true values first, the estimate second
@@ -94,6 +100,8 @@
# phenotype won't refer to C
yhat = eval(ind.phenotype)
assert np.isrealobj(yhat)
+if y.shape != yhat.shape:
+    raise ValueError(shape_mismatch_txt)

# let's always call the error function with the true
# values first, the estimate second
11 changes: 6 additions & 5 deletions src/utilities/fitness/get_data.py
@@ -7,7 +7,8 @@
def get_Xy_train_test_separate(train_filename, test_filename, skip_header=0):
"""
Read in training and testing data files, and split each into X
-(all columns up to last) and y (last column).
+(all columns up to last) and y (last column). The data files should
+contain one row per training example.
:param train_filename: The file name of the training dataset.
:param test_filename: The file name of the testing dataset.
@@ -57,8 +58,8 @@ def get_Xy_train_test_separate(train_filename, test_filename, skip_header=0):

try:
# Separate out input (X) and output (y) data.
-train_X = train_Xy[:, :-1].transpose() # all columns but last
-train_y = train_Xy[:, -1].transpose() # last column
+train_X = train_Xy[:, :-1] # all columns but last
+train_y = train_Xy[:, -1] # last column

except IndexError:
s = "utilities.fitness.get_data.get_Xy_train_test_separate\n" \
@@ -72,8 +73,8 @@ def get_Xy_train_test_separate(train_filename, test_filename, skip_header=0):
delimiter=delimiter)

# Separate out input (X) and output (y) data.
-test_X = test_Xy[:, :-1].transpose() # all columns but last
-test_y = test_Xy[:, -1].transpose() # last column
+test_X = test_Xy[:, :-1] # all columns but last
+test_y = test_Xy[:, -1] # last column

else:
test_X, test_y = None, None
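A quick shape check on the new split, with a toy array standing in for a dataset loaded from disk (illustrative only):

    import numpy as np

    # Toy stand-in for a dataset read from disk: 5 examples,
    # 2 input columns plus 1 target column.
    train_Xy = np.arange(15.0).reshape(5, 3)

    train_X = train_Xy[:, :-1]  # all columns but last
    train_y = train_Xy[:, -1]   # last column

    assert train_X.shape == (5, 2)  # (n_samples, n_features): no transpose
    assert train_y.shape == (5,)    # one target per example
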
16 changes: 14 additions & 2 deletions src/utilities/fitness/optimize_constants.py
@@ -32,10 +32,17 @@ def optimize_constants(x, y, ind):
# Pre-load the error metric fitness function.
loss = params['ERROR_METRIC']

shape_mismatch_txt = """Shape mismatch between y and yhat. Please check
that your grammar uses the `x[:, 0]` style, not `x[0]`. Please see change
at https://github.com/PonyGE/PonyGE2/issues/130."""

if n_consts == 0:
# ind doesn't refer to c: no need to optimize
c = []
-fitness = loss(y, f(x, c))
+yhat = f(x, c)
+if y.shape != yhat.shape:
+    raise ValueError(shape_mismatch_txt)
+fitness = loss(y, yhat)
ind.opt_consts = c
return fitness

@@ -46,7 +53,12 @@ def optimize_constants(x, y, ind):
# methods to try out.
init = [0.0] * n_consts

-res = scipy.optimize.minimize(obj, init, method="L-BFGS-B")
+try:
+    res = scipy.optimize.minimize(obj, init, method="L-BFGS-B")
+except ValueError:
+    raise ValueError("Error during optimization of constants. " \
+                     "Possible cause: " + shape_mismatch_txt)


# the result is accessed like a dict
ind.opt_consts = res['x'] # the optimum values of the constants
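For orientation, the optimization pattern used above in a self-contained sketch; f, obj, loss, and the data here are toy stand-ins, not PonyGE2 internals:

    import numpy as np
    import scipy.optimize

    def f(x, c):
        # toy "phenotype" with two tunable constants
        return c[0] * x[:, 0] + c[1]

    def loss(y, yhat):
        return np.mean((y - yhat) ** 2)  # squared-error metric

    np.random.seed(0)
    x = np.random.rand(20, 3)           # 20 samples, 3 features
    y = 3.0 * x[:, 0] + 0.5             # exact optimum is c = [3.0, 0.5]

    obj = lambda c: loss(y, f(x, c))    # objective over the constants only
    init = [0.0] * 2                    # one starting value per constant

    res = scipy.optimize.minimize(obj, init, method="L-BFGS-B")
    print(res['x'])                     # approximately [3.0, 0.5]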
