botsquad · arjan · Jun 23, 2020 · Jun 29, 2020 · Jun 29, 2020 · Jun 29, 2020
diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex
@@ -35,53 +35,55 @@ defmodule BubbleMatch.Sentence do
   def naive_tokenize(input)
 
   def naive_tokenize("") do
-    %M{text: "", tokenizations: [[]]}
+    %M{text: "", tokenizations: new_graph()}
   end
 
   def naive_tokenize(input) when is_binary(input) do
     tokens = Tokenizer.tokenize(input)
-    %M{text: input, tokenizations: both_if_different(no_punct(tokens), tokens)}
+    graph = new_graph() |> build_token_graph(tokens)
+    %M{text: input, tokenizations: graph}
   end
 
   @doc """
-  Convert a JSON blob from Spacy NLP data into a list of sentences
+  Convert a JSON blob from Spacy NLP data into a sentence.
 
   This function takes the output of Spacy's [Doc.to_json][spacy]
-  function and converts it into a list of sentences.
+  function and creates a Sentence struct from it. Note that the struct
+  might actually contain more than one sentence.
 
   [spacy]: https://spacy.io/api/doc#to_json
   """
-  @spec sentences_from_spacy(spacy_json :: map()) :: [t()]
-  def sentences_from_spacy(spacy_json) do
-    spacy_sentences_split(spacy_json["sents"], spacy_json, [])
-    |> Enum.map(fn {text, tokens, entities} ->
-      %M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)}
-      |> add_spacy_entities(entities, spacy_json)
-    end)
-  end
+  @spec from_spacy(spacy_json :: map()) :: t()
+  def from_spacy(spacy_json) do
+    text = spacy_json["text"]
 
-  defp spacy_sentences_split([], _result, acc) do
-    Enum.reverse(acc)
-  end
+    tokens =
+      spacy_json["tokens"]
+      |> Enum.map(&Token.from_spacy/1)
 
-  defp spacy_sentences_split([%{"start" => start, "end" => end_} | rest], result, acc) do
-    s_text = String.slice(result["text"], start, end_ - start)
+    graph = new_graph()
+    sents = spacy_json["sents"]
 
-    s_tokens =
-      result["tokens"]
-      |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
-      |> Enum.map(&Token.from_spacy/1)
-      |> reindex()
+    # one :start -> ... -> :end path per sentence, built from the tokens inside its span
+    graph =
+      Enum.reduce(sents, graph, fn %{"start" => start, "end" => end_}, graph ->
+        ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_))
+        build_token_graph(graph, ts)
+      end)
 
-    s_ents = result["ents"] |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
-    spacy_sentences_split(rest, result, [{s_text, s_tokens, s_ents} | acc])
-  end
+    pairs = Enum.zip(sents, Enum.drop(sents, 1))
 
-  defp add_spacy_entities(%M{} = m, [], _), do: m
+    # link each sentence to its successor; skipped when a boundary token is not found
+    graph =
+      Enum.reduce(pairs, graph, fn {%{"end" => end_}, %{"start" => start}}, graph ->
+        {t_start, t_end} = find_start_end(graph, start, end_)
+        if t_start && t_end, do: Graph.add_edge(graph, t_start, t_end), else: graph
+      end)
 
-  defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do
-    sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)])
-    add_tokenization(m, sequences)
+    # add entities
+    ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text))
+    graph = add_entities(graph, ents)
+    %M{text: text, tokenizations: graph}
   end
 
   @doc """
@@ -97,73 +99,132 @@ defmodule BubbleMatch.Sentence do
   def add_duckling_entities(%M{} = sentence, []), do: sentence
 
   def add_duckling_entities(%M{} = sentence, entities) do
-    sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)])
-    add_tokenization(sentence, sequences)
-  end
+    ents = Enum.map(entities, &Token.from_duckling_entity(&1))
 
-  @doc false
-  def add_tokenization(%M{} = m, replace_token_sequences) do
-    raw_tokens = List.last(m.tokenizations)
+    graph = add_entities(sentence.tokenizations, ents)
+    %M{sentence | tokenizations: graph}
+  end
 
-    tokenization =
-      replace_token_sequences
-      |> Enum.reduce(raw_tokens, fn seq, toks ->
-        replace_tokens(toks, seq)
+  def skip_punct(%M{tokenizations: graph} = m) do # returns m with the graph updated by connect_punct/3
+    graph =
+      Enum.reduce(Graph.vertices(graph), graph, fn v, graph ->
+        connect_punct(graph, v, nil) # nil: no origin vertex yet, every vertex starts its own walk
       end)
 
-    tokenizations = both_if_different(no_punct(tokenization), tokenization)
-    %M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)}
+    %{m | tokenizations: graph}
   end
 
-  defp replace_tokens(token_sequence, replace_tokens) do
-    # find
-    start = List.first(replace_tokens).start
-    end_ = List.last(replace_tokens).end
+  ###
+
+  # Walks forward from `v` through runs of punctuation tokens and adds a
+  # direct edge from `first` (the vertex the walk started at) to every
+  # non-punctuation vertex found behind such a run. `first` is nil on the
+  # initial call, so the direct successors of `v` itself get no extra edge.
+  defp connect_punct(graph, v, first) do
+    {puncts, others} = out_vertices(graph, v) |> Enum.split_with(&Token.punct?/1)
+
+    # Only bridge from a non-punctuation origin; a walk that started at a
+    # punctuation vertex must not create edges out of that vertex.
+    graph =
+      if first && not Token.punct?(first) do
+        Enum.reduce(others, graph, &Graph.add_edge(&2, first, &1))
+      else
+        graph
+      end
+
+    # Keep following punctuation even when `v` also has non-punctuation
+    # successors (e.g. an entity vertex), so that no skip edge is lost.
+    Enum.reduce(puncts, graph, &connect_punct(&2, &1, first || v))
+  end
 
-    start_idx = Enum.find_index(token_sequence, &(&1.start == start))
-    end_idx = Enum.find_index(token_sequence, &(&1.end == end_))
+  # Empty directed token graph that holds only the :start and :end sentinels.
+  defp new_graph() do
+    Graph.add_vertices(Graph.new(type: :directed), [:start, :end])
+  end
 
-    cond do
-      start_idx != nil and end_idx != nil and end_idx >= start_idx ->
-        {a, _} = Enum.split(token_sequence, start_idx)
-        {_, b} = Enum.split(token_sequence, end_idx + 1)
+  # Links `tokens` into `graph` as a single path that begins at :start and,
+  # for a non-empty list, ends at :end.
+  defp build_token_graph(graph, tokens), do: build_token_graph(graph, tokens, :start)
 
-        (a ++ replace_tokens ++ b)
-        |> reindex()
+  # No tokens at all: the graph is returned unchanged (the recursion itself
+  # stops in the single-token clause, so this only matches an empty input).
+  defp build_token_graph(graph, [], _prev), do: graph
 
-      start_idx != nil and end_idx == nil ->
-        {a, _} = Enum.split(token_sequence, start_idx)
+  # Final token: connect it to its predecessor and close the path at :end.
+  defp build_token_graph(graph, [last], prev) do
+    graph = Graph.add_vertices(graph, [last, prev, :end])
+    graph = Graph.add_edge(graph, prev, last)
+    Graph.add_edge(graph, last, :end)
+  end
 
-        (a ++ replace_tokens)
-        |> reindex()
+  # Two or more tokens left: connect the head to `prev`, then continue from it.
+  defp build_token_graph(graph, [a, b | rest], prev) do
+    graph = Graph.add_vertices(graph, [a, b])
+    graph = Graph.add_edge(graph, prev, a)
+    build_token_graph(graph, [b | rest], a)
+  end
 
-      start_idx == nil and end_idx != nil ->
-        {_, b} = Enum.split(token_sequence, end_idx + 1)
+  # Looks up the token vertices that border the character span start..end_.
+  # Either element of the returned pair is nil when no such token exists.
+  defp find_start_end(graph, start, end_) do
+    tokens = Enum.filter(Graph.vertices(graph), &is_map/1)
 
-        (replace_tokens ++ b)
-        |> reindex()
+    # t_start ends one char before `start`; t_end begins at `end_` or one char after it.
+    t_start = Enum.find(tokens, &(&1.end == start - 1))
+    t_end = Enum.find(tokens, &(&1.start == end_ + 1 || &1.start == end_))
 
-      true ->
-        # raise RuntimeError, "Token not found at start = #{start}, end = #{end_}"
-        token_sequence
-    end
+    {t_start, t_end}
   end
 
-  defp reindex(tokens) do
-    tokens
-    |> Enum.with_index()
-    |> Enum.map(fn {t, index} ->
-      %{t | index: index}
+  # Adds every entity in `ents` to `graph` as an extra vertex next to the
+  # plain tokens.
+  #
+  # For each entity the neighbouring tokens are looked up with
+  # `find_start_end/3`. An edge is added from the token in front of the
+  # entity to the entity, and from the entity to the token behind it. When
+  # no neighbour is found on a side, the :start / :end sentinel takes its
+  # place.
+  defp add_entities(graph, ents) do
+    Enum.reduce(ents, graph, fn %{start: start, end: end_} = ent, graph ->
+      {t_start, t_end} = find_start_end(graph, start, end_)
+
+      # `||` falls back to the sentinel when the lookup returned nil
+      incoming = t_start || :start
+      outgoing = t_end || :end
+
+      graph
+      |> Graph.add_vertex(ent)
+      |> Graph.add_edge(incoming, ent)
+      |> Graph.add_edge(ent, outgoing)
     end)
   end
 
-  defp both_if_different(a, b, rest \\ [])
-  defp both_if_different(a, a, rest), do: [a | rest]
-  defp both_if_different(a, b, rest), do: [a, b | rest]
+  def out_vertices(graph, vertex) do
+    # the target (v2) of every edge that leaves `vertex`
+    for edge <- Graph.out_edges(graph, vertex), do: edge.v2
+  end
 
-  defp no_punct(tokens) do
-    tokens |> Enum.reject(&Token.punct?/1)
+  # Renders the token graph of `sentence` as Graphviz DOT source text.
+  # Lines are joined with "\n"; there is no trailing newline.
+  def make_dot(sentence) do
+    graph = sentence.tokenizations
+    # Quotes and backslashes inside a token would otherwise end the label early.
+    esc = fn s -> s |> String.replace("\\", "\\\\") |> String.replace("\"", "\\\"") end
+
+    nodes =
+      for v <- Graph.vertices(graph), v not in [:start, :end] do
+        "  #{vertex_id(v)}[label=\"#{esc.(to_string(v))}\"]"
+      end
+
+    edges = for e <- Graph.edges(graph), do: "  #{vertex_id(e.v1)} -> #{vertex_id(e.v2)}"
+    header = ["digraph {", "  start[label=\"START\"]", "  end[label=\"END\"]"]
+
+    Enum.join(header ++ nodes ++ edges ++ ["}"], "\n")
   end
+
+  defp vertex_id(:start), do: "start" # DOT node id of the :start sentinel
+  defp vertex_id(:end), do: "end" # DOT node id of the :end sentinel
+  defp vertex_id(v), do: "v#{v.index}" # NOTE(review): assumes `index` is unique per vertex; confirm for entity tokens
 end
 
 defimpl String.Chars, for: BubbleMatch.Sentence do

diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex
@@ -160,6 +160,9 @@ defmodule BubbleMatch.Token do
 end
 
 defimpl String.Chars, for: BubbleMatch.Token do
+  def to_string(%BubbleMatch.Token{type: :entity, raw: raw, value: value}),
+    do: "#{raw} [#{value.kind}]" # entity tokens print as their raw text plus `value.kind` in brackets
+
   def to_string(%BubbleMatch.Token{raw: raw}), do: raw
 end
 

diff --git a/mix.exs b/mix.exs
@@ -44,6 +44,7 @@ defmodule BubbleMatch.MixProject do
       {:nimble_parsec, "~> 0.5.3"},
       {:inflex, "~> 2.0"},
       {:jason, "~> 1.0"},
+      {:libgraph, "~> 0.13"},
       {:ex_doc, ">= 0.0.0", only: :dev},
       {:stream_data, "~> 0.1", only: :test}
     ]

diff --git a/mix.lock b/mix.lock
@@ -9,6 +9,7 @@
   "idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"},
   "inflex": {:hex, :inflex, "2.0.0", "db69d542b8fdb23ac667f9bc0c2395a3983fa2da6ae2efa7ab5dc541928f7a75", [:mix], [], "hexpm", "c018852409bd48b03ad96ed53594186bc074bdd1519043a0ad1fa5697aac4399"},
   "jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"},
+  "libgraph": {:hex, :libgraph, "0.13.3", "20732b7bafb933dcf7351c479e03076ebd14a85fd3202c67a1c197f4f7c2466b", [:mix], [], "hexpm", "78f2576eef615440b46f10060b1de1c86640441422832052686df53dc3c148c6"},
   "makeup": {:hex, :makeup, "1.0.1", "82f332e461dc6c79dbd82fbe2a9c10d48ed07146f0a478286e590c83c52010b5", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "49736fe5b66a08d8575bf5321d716bac5da20c8e6b97714fec2bcd6febcfa1f8"},
   "makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"},
   "match_engine": {:hex, :match_engine, "1.4.5", "d57752c6cc799f5eca89b564a11b3ca0f35253aa4efc9cc35fd1bbb7f35eee42", [:mix], [{:simetric, "~> 0.2.0", [hex: :simetric, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: false]}], "hexpm", "95356a6d3bdbf6667f6d2801a97a30dd8a95400b8c5eeb570e3a8e6c86b5373c"},

diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs
@@ -3,22 +3,39 @@ defmodule BubbleMatch.SentenceTest do
 
   alias BubbleMatch.{Entity, Sentence}
 
+  test "tokenize" do
+    sentence =
+      Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+      |> Sentence.skip_punct()
+
+    graph = Sentence.make_dot(sentence)
+    # Uncomment to print the generated DOT source: IO.puts(graph)
+    # Uncomment to render and open the graph: view_graph(sentence)
+    assert String.contains?(graph, "start -> v0")
+    assert String.contains?(graph, "v0 -> v1")
+    assert String.contains?(graph, "v1 -> v2")
+    # punct is skipped: v1 gets a direct edge past v2 (presumably the comma token)
+    assert String.contains?(graph, "v1 -> v3")
+  end
+
   @spacy_json """
               {"ents":[{"end":27,"label":"PERSON","start":21}],"sents":[{"end":9,"start":0},{"end":27,"start":10}],"text":"Hi there. My name is George","tokens":[{"dep":"ROOT","end":2,"head":0,"id":0,"lemma":"hi","norm":"hi","pos":"INTJ","start":0,"string":"Hi ","tag":"UH"},{"dep":"advmod","end":8,"head":0,"id":1,"lemma":"there","norm":"there","pos":"ADV","start":3,"string":"there","tag":"RB"},{"dep":"punct","end":9,"head":0,"id":2,"lemma":".","norm":".","pos":"PUNCT","start":8,"string":". ","tag":"."},{"dep":"poss","end":12,"head":4,"id":3,"lemma":"-PRON-","norm":"my","pos":"DET","start":10,"string":"My ","tag":"PRP$"},{"dep":"nsubj","end":17,"head":5,"id":4,"lemma":"name","norm":"name","pos":"NOUN","start":13,"string":"name ","tag":"NN"},{"dep":"ROOT","end":20,"head":5,"id":5,"lemma":"be","norm":"is","pos":"AUX","start":18,"string":"is ","tag":"VBZ"},{"dep":"attr","end":27,"head":5,"id":6,"lemma":"George","norm":"george","pos":"PROPN","start":21,"string":"George","tag":"NNP"}]}
               """
               |> Jason.decode!()
 
   test "from_spacy" do
-    [hithere, mynameis] = Sentence.sentences_from_spacy(@spacy_json)
+    sentence = Sentence.from_spacy(@spacy_json)
 
-    assert [_, [_, _, _]] = hithere.tokenizations
+    # Both spacy sentences end up in one struct that keeps the complete text.
+    assert %Sentence{text: "Hi there. My name is George"} = sentence
+    assert String.starts_with?(Sentence.make_dot(sentence), "digraph {")
 
-    assert [with_ents, raw_tokens] = mynameis.tokenizations
+    # TODO: port the old per-sentence assertions below to the token graph.
 
-    assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
-    assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)
+    # assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
+    # assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)
 
-    assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
+    # assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
   end
 
   test "match from spacy" do
@@ -56,6 +73,9 @@ defmodule BubbleMatch.SentenceTest do
     sentence =
       Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away")
       |> Sentence.add_duckling_entities(@duckling_json)
+      |> Sentence.skip_punct()
+
+    view_graph(sentence)
 
     assert [with_ents, with_ents_punct | _] = sentence.tokenizations
 
@@ -110,21 +130,21 @@ defmodule BubbleMatch.SentenceTest do
                  |> Jason.decode!()
 
   @time_spacy """
-              {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":1},{"start":2,"end":6}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
+              {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":9}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
               """
               |> Jason.decode!()
 
   test "overlapping duckling entities" do
-    [a, b] = Sentence.sentences_from_spacy(@time_spacy)
+    Sentence.from_spacy(@time_spacy)
+    |> Sentence.add_duckling_entities(@time_duckling)
+    |> Sentence.skip_punct()
+    |> view_graph() # NOTE(review): no assertions yet; this only checks that the pipeline does not raise
+  end
 
-    assert [_] = a.tokenizations
-    a = a |> Sentence.add_duckling_entities(@time_duckling)
-    assert [with_ents, _raw_tokens] = a.tokenizations
-    assert List.first(with_ents).value.kind == "time"
+  defp view_graph(sentence) do
+    File.write!("/tmp/x.dot", Sentence.make_dot(sentence))
 
-    assert [_] = b.tokenizations
-    b = b |> Sentence.add_duckling_entities(@time_duckling)
-    assert [with_ents, _raw_tokens] = b.tokenizations
-    assert List.first(with_ents).value.kind == "time"
+    # Rendering/opening the image is opt-in: it needs graphviz and eog and blocks the test run.
+    System.get_env("VIEW_GRAPH") && :os.cmd('dot /tmp/x.dot -Tpng > /tmp/x.png; eog /tmp/x.png')
   end
 end