From 847a81a09fcb5b811b7bc6643e278c41b3d329c6 Mon Sep 17 00:00:00 2001 From: Richard Wu Date: Sat, 16 Feb 2019 18:31:40 -0500 Subject: [PATCH 1/3] Store domain as a Postgres array type and keep as list in memory. Remove domain_size column. --- dataset/dataset.py | 4 ++-- domain/domain.py | 14 +++++--------- domain/estimators/logistic.py | 10 +++++----- domain/estimators/naive_bayes.py | 2 +- repair/featurize/featurized_dataset.py | 2 +- repair/featurize/freqfeat.py | 4 ++-- repair/featurize/initsimfeat.py | 4 ++-- repair/featurize/langmodelfeat.py | 4 ++-- repair/featurize/occurattrfeat.py | 2 +- 9 files changed, 21 insertions(+), 25 deletions(-) diff --git a/dataset/dataset.py b/dataset/dataset.py index ee1c5ae4e..3c1ac9ccc 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -267,7 +267,7 @@ def get_domain_info(self): """ Returns (number of random variables, count of distinct values across all attributes). """ - query = 'SELECT count(_vid_), max(domain_size) FROM %s'%AuxTables.cell_domain.name + query = 'SELECT COUNT(_vid_), MAX(LENGTH(domain)) FROM %s'%AuxTables.cell_domain.name res = self.engine.execute_query(query) total_vars = int(res[0][0]) classes = int(res[0][1]) @@ -279,7 +279,7 @@ def get_inferred_values(self): query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \ "FROM " \ "(SELECT _tid_, attribute, " \ - "_vid_, init_value, string_to_array(regexp_replace(domain, \'[{\"\"}]\', \'\', \'gi\'), \'|||\') as domain " \ + "_vid_, init_value, domain" \ "FROM %s) as t1, %s as t2 " \ "WHERE t1._vid_ = t2._vid_"%(AuxTables.cell_domain.name, AuxTables.inf_values_idx.name) self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_']) diff --git a/domain/domain.py b/domain/domain.py index 4dbce5e93..c8afa2ad1 100644 --- a/domain/domain.py +++ b/domain/domain.py @@ -124,7 +124,7 @@ def store_domains(self, domain): self.ds.generate_aux_table(AuxTables.cell_domain, domain, store=True, index_attrs=['_vid_']) self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_tid_']) self.ds.aux_table[AuxTables.cell_domain].create_db_index(self.ds.engine, ['_cid_']) - query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(string_to_array(regexp_replace(domain,\'[{\"\"}]\',\'\',\'gi\'),\'|||\')) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name + query = "SELECT _vid_, _cid_, _tid_, attribute, a.rv_val, a.val_id from %s , unnest(domain::TEXT[]) WITH ORDINALITY a(rv_val,val_id)" % AuxTables.cell_domain.name self.ds.generate_aux_table_sql(AuxTables.pos_values, query, index_attrs=['_tid_', 'attribute']) def setup_attributes(self): @@ -219,8 +219,7 @@ def generate_domain(self): _cid_: cell ID (unique for every entity-attribute) _vid_: variable ID (1-1 correspondence with _cid_) attribute: attribute name - domain: ||| separated string of domain values - domain_size: length of domain + domain: array of domain values init_value: initial value for this cell init_value_idx: domain index of init_value fixed: 1 if a random sample was taken since no correlated attributes/top K values @@ -252,8 +251,7 @@ def generate_domain(self): "attribute": attr, "_cid_": cid, "_vid_": vid, - "domain": "|||".join(dom), - "domain_size": len(dom), + "domain": dom, "init_value": init_value, "init_index": init_value_idx, "weak_label": weak_label, @@ -270,8 +268,7 @@ def generate_domain(self): "attribute": attr, "_cid_": cid, "_vid_": vid, - "domain": "|||".join(dom), - "domain_size": len(dom), + "domain": dom, "init_value": init_value, "init_index": init_value_idx, "weak_label": init_value, @@ -323,8 +320,7 @@ def generate_domain(self): domain_values.append(row['init_value']) domain_values = sorted(domain_values) # update our memoized domain values for this row again - row['domain'] = '|||'.join(domain_values) - row['domain_size'] = len(domain_values) + row['domain'] = domain_values row['weak_label_idx'] = domain_values.index(row['weak_label']) row['init_index'] = domain_values.index(row['init_value']) diff --git a/domain/estimators/logistic.py b/domain/estimators/logistic.py index ca247a7b4..abd05f7e6 100644 --- a/domain/estimators/logistic.py +++ b/domain/estimators/logistic.py @@ -25,7 +25,7 @@ def __init__(self, env, dataset, domain_df, active_attrs, batch_size=32): """ :param dataset: (Dataset) original dataset :param domain_df: (DataFrame) currently populated domain dataframe. - Required columns are: _vid_, _tid_, attribute, domain, domain_size, init_value + Required columns are: _vid_, _tid_, attribute, domain, init_value :param active_attrs: (list[str]) attributes that have random values """ torch.nn.Module.__init__(self) @@ -40,7 +40,7 @@ def __init__(self, env, dataset, domain_df, active_attrs, batch_size=32): # self.dom maps tid --> attr --> list of domain values # we need to find the number of domain values we will be generating # a training sample for. - self.n_samples = int(domain_df['domain_size'].sum()) + self.n_samples = int(domain_df['domain'].apply(len).sum()) # Create and initialize featurizers. self.featurizers = [CooccurAttrFeaturizer(self.ds)] @@ -86,7 +86,7 @@ def _gen_training_data(self): self.vid_to_idxs = {} for rec in tqdm(list(self.domain_records)): init_row = raw_data_dict[rec['_tid_']] - domain_vals = rec['domain'].split('|||') + domain_vals = rec['domain'] # Generate the feature tensor for all the domain values for this # cell. @@ -166,7 +166,7 @@ def predict_pp(self, row, attr=None, values=None): start_idx, end_idx = self.vid_to_idxs[row['_vid_']] pred_X = self._X[start_idx:end_idx] pred_Y = self.forward(pred_X) - values = self.domain_records[row['_vid_']]['domain'].split('|||') + values = self.domain_records[row['_vid_']]['domain'] return zip(values, map(float, pred_Y)) def predict_pp_batch(self, raw_records_by_tid=None, cell_domain_rows=None): @@ -179,7 +179,7 @@ def predict_pp_batch(self, raw_records_by_tid=None, cell_domain_rows=None): """ pred_Y = self.forward(self._X) for rec in self.domain_records: - values = rec['domain'].split('|||') + values = rec['domain'] start_idx, end_idx = self.vid_to_idxs[rec['_vid_']] yield zip(values, map(float, pred_Y[start_idx:end_idx])) diff --git a/domain/estimators/naive_bayes.py b/domain/estimators/naive_bayes.py index c3235dfa4..21c8ff2cf 100644 --- a/domain/estimators/naive_bayes.py +++ b/domain/estimators/naive_bayes.py @@ -66,7 +66,7 @@ def predict_pp_batch(self): val is the domain value and proba is the estimator's posterior probability estimate. """ for row in tqdm(self.domain_df.to_records()): - yield self.predict_pp(self._raw_records_by_tid[row['_tid_']], row['attribute'], row['domain'].split('|||')) + yield self.predict_pp(self._raw_records_by_tid[row['_tid_']], row['attribute'], row['domain']) def _get_corr_attributes(self, attr): """ diff --git a/repair/featurize/featurized_dataset.py b/repair/featurize/featurized_dataset.py index 2ce666bd2..724aa33fe 100644 --- a/repair/featurize/featurized_dataset.py +++ b/repair/featurize/featurized_dataset.py @@ -101,7 +101,7 @@ def generate_var_mask(self): is valid for the i-th VID and tensor[i][j] = -10e6 otherwise. """ var_to_domsize = {} - query = 'SELECT _vid_, domain_size FROM %s' % AuxTables.cell_domain.name + query = 'SELECT _vid_, LENGTH(domain) FROM %s' % AuxTables.cell_domain.name res = self.ds.engine.execute_query(query) mask = torch.zeros(self.total_vars,self.classes) for tuple in tqdm(res): diff --git a/repair/featurize/freqfeat.py b/repair/featurize/freqfeat.py index 1e9656560..1c1f5e7c6 100644 --- a/repair/featurize/freqfeat.py +++ b/repair/featurize/freqfeat.py @@ -16,7 +16,7 @@ def specific_setup(self): def gen_feat_tensor(self, input, classes): vid = int(input[0]) attribute = input[1] - domain = input[2].split('|||') + domain = input[2] attr_idx = self.ds.attr_to_idx[attribute] tensor = torch.zeros(1, classes, self.attrs_number) for idx, val in enumerate(domain): @@ -25,7 +25,7 @@ def gen_feat_tensor(self, input, classes): return tensor def create_tensor(self): - query = 'SELECT _vid_, attribute, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name + query = 'SELECT _vid_, attribute, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name results = self.ds.engine.execute_query(query) tensors = [self.gen_feat_tensor(res, self.classes) for res in results] combined = torch.cat(tensors) diff --git a/repair/featurize/initsimfeat.py b/repair/featurize/initsimfeat.py index 364e0f471..b4d6f4bd9 100644 --- a/repair/featurize/initsimfeat.py +++ b/repair/featurize/initsimfeat.py @@ -13,7 +13,7 @@ def gen_feat_tensor(input, classes, total_attrs): init_value = input[2] # TODO: To add more similarity metrics increase the last dimension of tensor. tensor = torch.zeros(1, classes, total_attrs) - domain = input[3].split('|||') + domain = input[3] for idx, val in enumerate(domain): if val == init_value: sim = -1.0 @@ -53,7 +53,7 @@ def specific_setup(self): raise ValueError("The size of init_weight for InitSimFeaturizer %d does not match the number of attributes %d." % (self.init_weight.shape[0], len(self.all_attrs))) def create_tensor(self): - query = 'SELECT _vid_, attribute, init_value, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name + query = 'SELECT _vid_, attribute, init_value, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name results = self.ds.engine.execute_query(query) map_input = [] for res in results: diff --git a/repair/featurize/langmodelfeat.py b/repair/featurize/langmodelfeat.py index 7332daf18..fac4c9a97 100644 --- a/repair/featurize/langmodelfeat.py +++ b/repair/featurize/langmodelfeat.py @@ -21,7 +21,7 @@ def specific_setup(self): def gen_feat_tensor(self, input, classes): vid = int(input[0]) attribute = input[1] - domain = input[2].split('|||') + domain = input[2] attr_idx = self.ds.attr_to_idx[attribute] model = self.attr_language_model[attribute] tensor = torch.zeros(1, classes, self.attrs_number*self.emb_size) @@ -33,7 +33,7 @@ def gen_feat_tensor(self, input, classes): return tensor def create_tensor(self): - query = 'SELECT _vid_, attribute, domain FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name + query = 'SELECT _vid_, attribute, domain::TEXT[] FROM %s ORDER BY _vid_' % AuxTables.cell_domain.name results = self.ds.engine.execute_query(query) tensors = [self.gen_feat_tensor(res, self.classes) for res in results] combined = torch.cat(tensors) diff --git a/repair/featurize/occurattrfeat.py b/repair/featurize/occurattrfeat.py index 35f02cfac..e08b0e151 100644 --- a/repair/featurize/occurattrfeat.py +++ b/repair/featurize/occurattrfeat.py @@ -47,7 +47,7 @@ def create_tensor(self): def gen_feat_tensor(self, row, tuple): tensor = torch.zeros(1, self.classes, self.attrs_number*self.attrs_number) rv_attr = row['attribute'] - domain = row['domain'].split('|||') + domain = row['domain'] rv_domain_idx = {val: idx for idx, val in enumerate(domain)} rv_attr_idx = self.ds.attr_to_idx[rv_attr] for attr in self.all_attrs: From 9a65495c7bdce19870cf0020240fcf677f16e375 Mon Sep 17 00:00:00 2001 From: Richard Wu Date: Mon, 4 Mar 2019 18:21:03 -0500 Subject: [PATCH 2/3] Missing TEXT[] cast in SQL query --- dataset/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataset/dataset.py b/dataset/dataset.py index 3c1ac9ccc..b57c7d9c2 100644 --- a/dataset/dataset.py +++ b/dataset/dataset.py @@ -279,7 +279,7 @@ def get_inferred_values(self): query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \ "FROM " \ "(SELECT _tid_, attribute, " \ - "_vid_, init_value, domain" \ + "_vid_, init_value, domain::TEXT[] " \ "FROM %s) as t1, %s as t2 " \ "WHERE t1._vid_ = t2._vid_"%(AuxTables.cell_domain.name, AuxTables.inf_values_idx.name) self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_']) From bd48158dbb2f3f323df520cc524df2e19b1a6913 Mon Sep 17 00:00:00 2001 From: Richard Wu Date: Mon, 4 Mar 2019 18:21:15 -0500 Subject: [PATCH 3/3] temp for MET dataset --- examples/holoclean_repair_example.py | 13 ++++++------- testdata/met_constraints.txt | 22 ++++++++++------------ 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/examples/holoclean_repair_example.py b/examples/holoclean_repair_example.py index d9c78d0ba..60c8afc62 100644 --- a/examples/holoclean_repair_example.py +++ b/examples/holoclean_repair_example.py @@ -27,8 +27,8 @@ ).session # 2. Load training data and denial constraints. -hc.load_data('hospital', '../testdata/hospital.csv') -hc.load_dcs('../testdata/hospital_constraints.txt') +hc.load_data('hospital', '../testdata/met_1000_transformed.csv') +hc.load_dcs('../testdata/met_constraints.txt') hc.ds.set_constraints(hc.get_dcs()) # 3. Detect erroneous cells using these two detectors. @@ -42,13 +42,12 @@ OccurAttrFeaturizer(), FreqFeaturizer(), ConstraintFeaturizer(), - LangModelFeaturizer(), ] hc.repair_errors(featurizers) # 5. Evaluate the correctness of the results. -hc.evaluate(fpath='../testdata/hospital_clean.csv', - tid_col='tid', - attr_col='attribute', - val_col='correct_val') +# hc.evaluate(fpath='../testdata/hospital_clean.csv', +# tid_col='tid', +# attr_col='attribute', +# val_col='correct_val') diff --git a/testdata/met_constraints.txt b/testdata/met_constraints.txt index 4d0160ffb..ab02ddd60 100644 --- a/testdata/met_constraints.txt +++ b/testdata/met_constraints.txt @@ -1,14 +1,12 @@ -t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.City,t2.City) -t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.State,t2.State) -t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.Country,t2.Country) -t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.City,t2.City) -t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.State,t2.State) -t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.Country,t2.Country) -t1&t2&EQ(t1.River,t2.River)&IQ(t1.Country,t2.Country) -t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.City,t2.City) -t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.State,t2.State) -t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Country,t2.Country) +# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.City,t2.City) +# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.State,t2.State) +# t1&t2&EQ(t1.Region,t2.Region)&IQ(t1.Country,t2.Country) +# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.City,t2.City) +# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.State,t2.State) +# t1&t2&EQ(t1.Subregion,t2.Subregion)&IQ(t1.Country,t2.Country) +# t1&t2&EQ(t1.River,t2.River)&IQ(t1.Country,t2.Country) +# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.City,t2.City) +# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.State,t2.State) +# t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Country,t2.Country) t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Dynasty,t2.Dynasty) t1&t2&EQ(t1.Reign,t2.Reign)&IQ(t1.Period,t2.Period) - -