-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessor.py
More file actions
109 lines (92 loc) · 3.89 KB
/
preprocessor.py
File metadata and controls
109 lines (92 loc) · 3.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import json
import os
from nltk.tree import ParentedTree
from nltk.parse import stanford
from nltk.tokenize import sent_tokenize
from quantity_schema2 import *
from utils import *
# TODO: pretty print in json
class Preprocessor:
    """Convert raw math word problems into quantity-schema JSON.

    Loads problems from a JSON file (a list of dicts, each with an
    ``sQuestion`` text field), parses every sentence with the Stanford
    PCFG and dependency parsers, and extracts per-sentence quantity
    schemas via ``dealWithSentence`` (imported from ``utils``).
    """

    def __init__(self, rawProblemFile, debug=False):
        """Load the problems and set up the Stanford parsers.

        :param rawProblemFile: path to the raw problem JSON file.
        :param debug: when True, print diagnostics during extraction.
        """
        self.debug = debug
        with open(rawProblemFile, 'r') as f:
            self.problems = json.load(f)
        # NOTE(review): machine-specific hard-coded parser paths; these
        # should eventually come from configuration or the environment.
        os.environ['STANFORD_PARSER'] = '/Users/liming/nltk_data/stanford-parser-full-2018-10-17/'
        os.environ['STANFORD_MODELS'] = '/Users/liming/nltk_data/stanford-parser-full-2018-10-17/'
        self.pcfgParser = stanford.StanfordParser(model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
        self.depParser = stanford.StanfordDependencyParser()

    def prepare(self, problemFile):
        """Extract schemas for every problem and dump them to *problemFile* as JSON."""
        prblmDict = self.extractQuantitySchema()
        with open(problemFile, 'w') as f:
            json.dump(prblmDict, f, indent=4, sort_keys=True)

    def extractQuantitySchema(self):
        """Parse every loaded problem and build its quantity schemas.

        :returns: a list with one dict per problem, of the form
            ``{'quantities': [...], 'quantity_schema': [...], 'question': str}``.
        """
        qSchemas = []
        for i, problem in enumerate(self.problems):
            print('Problem', i)
            # TODO: Use the tokenizer instead: can be a problem when dealing with
            # floating point values
            # BUG FIX: the original called nltk.sent_tokenize, but `nltk`
            # itself is never imported (only `sent_tokenize` is) — that was
            # a guaranteed NameError. Use the imported name directly.
            sents = sent_tokenize(problem['sQuestion'])
            pSents = self.pcfgParser.raw_parse_sents(sents)
            dSents = self.depParser.raw_parse_sents(sents)
            schemas = {'quantities': [], 'quantity_schema': []}
            for j, (pline, dline) in enumerate(zip(pSents, dSents)):
                for psent, dsent in zip(pline, dline):
                    pcfgTree = ParentedTree.fromstring(str(psent))
                    depTree = dsent
                    schema = dealWithSentence(pcfgTree, depTree)
                    # Label each verb with the sentence order
                    for q_schema in schema['quantity_schema']:
                        q_schema['verb'] = str(j) + '_' + q_schema['verb']
                        schemas['quantity_schema'].append(q_schema)
                    schemas['quantities'] += schema['quantities']
            # Compensate for missing units: borrow the nearest non-empty
            # unit, preferring earlier schemas, then falling forward.
            # NOTE(review): this assumes len(quantities) equals
            # len(quantity_schema) — verify against dealWithSentence.
            n = len(schemas['quantities'])
            for k in range(n):
                if schemas['quantity_schema'][k]['unit']:
                    continue
                found = 0
                for l in range(k):
                    # BUG FIX: the original printed the "replaced with"
                    # message before testing the candidate, so empty units
                    # were logged too; log only the unit actually used.
                    if schemas['quantity_schema'][k - l - 1]['unit']:
                        if self.debug:
                            print('Detected missing unit, replaced with: ',
                                  schemas['quantity_schema'][k - l - 1]['unit'])
                        schemas['quantity_schema'][k]['unit'] = schemas['quantity_schema'][k - l - 1]['unit']
                        found = 1
                        break
                if found:
                    if self.debug:
                        print('Found missing unit')
                    continue
                # No earlier unit available: take the next non-empty one.
                for l in range(n - k):
                    if schemas['quantity_schema'][k + l]['unit']:
                        schemas['quantity_schema'][k]['unit'] = schemas['quantity_schema'][k + l]['unit']
                        break
            schemas['question'] = self.findQuestion(sents)
            # BUG FIX: this dump was unconditional despite the commented-out
            # debug guard above it in the original; restore the guard.
            if self.debug:
                quantities = schemas['quantities']
                print(quantities, schemas)
            qSchemas.append(schemas)
        return qSchemas

    def findQuestion(self, sents):
        """Return the question surface form of a problem.

        Takes the last sentence; if it contains comma-separated condition
        clauses, only the final clause (the actual question) is kept.
        """
        question = sents[-1]
        # If the question has conditions, ignore them and find the surface
        if ',' in question:
            question = question.split(',')[-1]
        return question

    def addSchema(self, schema, schemas):
        """Merge one problem's schema dict into the accumulator *schemas*.

        :param schema: dict with 'quantities' and 'quantity_schema' lists.
        :param schemas: accumulator dict with the same two keys; mutated.
        :returns: the (mutated) accumulator, for chaining.
        """
        schemas['quantity_schema'].extend(schema["quantity_schema"])
        schemas['quantities'] += schema["quantities"]
        return schemas

    # def extractQuantities(self):
    #     quantities = []
    #     for problem in self.problems:
    #         aligns = problem['lAlignments']
    #         sents = problem['']

    # def dependencyParing():
    # def posTagging():
if __name__ == "__main__":
    # Build quantity schemas for the AddSub dataset with debug output on.
    preprocessor = Preprocessor("data/add_sub/AddSub.json", True)
    preprocessor.prepare("data/add_sub/schema_all_test.json")