From db48e08caaea26ca7a97597217b71b9af4e567a9 Mon Sep 17 00:00:00 2001
From: Arjan Scherpenisse <arjan@scherpenisse.net>
Date: Tue, 23 Jun 2020 15:55:21 +0200
Subject: [PATCH 1/4] An experiment in representing a parsed sentence as a DAG

instead of a list of tokenizations
---
 lib/bubble_match/sentence.ex        | 61 +++++++++++++++++++++++++++--
 lib/bubble_match/token.ex           | 11 ++++++
 test/bubble_match/sentence_test.exs |  5 +++
 3 files changed, 74 insertions(+), 3 deletions(-)

diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex
index 54c7a81..7604a99 100644
--- a/lib/bubble_match/sentence.ex
+++ b/lib/bubble_match/sentence.ex
@@ -35,13 +35,13 @@ defmodule BubbleMatch.Sentence do
   def naive_tokenize(input)
 
   def naive_tokenize("") do
-    %M{text: "", tokenizations: [[]]}
+    %M{text: "", tokenizations: :digraph.new()}
   end
 
   def naive_tokenize(input) when is_binary(input) do
     tokens = Tokenizer.tokenize(input)
-    no_punct = Tokenizer.strip_punct(tokens)
-    %M{text: input, tokenizations: both_if_different(no_punct, tokens)}
+    graph = build_token_graph(tokens)
+    %M{text: input, tokenizations: graph}
   end
 
   @doc """
@@ -164,6 +164,61 @@ defmodule BubbleMatch.Sentence do
 
   defp both_if_different(a, a), do: [a]
   defp both_if_different(a, b), do: [a, b]
+
+  defp build_token_graph(tokens) do
+    graph = :digraph.new([:acyclic])
+    :digraph.add_vertex(graph, :start)
+    :digraph.add_vertex(graph, :end)
+    build_token_graph(tokens, :start, graph)
+  end
+
+  defp build_token_graph([], _prev, graph) do
+    graph
+  end
+
+  defp build_token_graph([last], prev, graph) do
+    :digraph.add_vertex(graph, last)
+    :digraph.add_vertex(graph, prev)
+    :digraph.add_vertex(graph, :end)
+    :digraph.add_edge(graph, prev, last)
+    :digraph.add_edge(graph, last, :end)
+    graph
+  end
+
+  defp build_token_graph([a, b | rest], prev, graph) do
+    :digraph.add_vertex(graph, a)
+    :digraph.add_vertex(graph, b)
+    :digraph.add_edge(graph, prev, a)
+
+    if Token.punct?(a) do
+      :digraph.add_edge(graph, prev, b)
+    end
+
+    build_token_graph([b | rest], a, graph)
+  end
+
+  def print_dot(sentence) do
+    IO.puts("digraph {")
+
+    IO.puts("  start[label=\"START\"]")
+    IO.puts("  end[label=\"END\"]")
+
+    for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do
+      IO.puts("  #{vertex_id(v)}[label=\"#{v.value}\"]")
+    end
+
+    for e <- :digraph.edges(sentence.tokenizations) do
+      {_, from, to, _} = :digraph.edge(sentence.tokenizations, e)
+
+      IO.puts("  #{vertex_id(from)} -> #{vertex_id(to)}")
+    end
+
+    IO.puts("}")
+  end
+
+  defp vertex_id(:start), do: "start"
+  defp vertex_id(:end), do: "end"
+  defp vertex_id(v), do: "v#{v.index}"
 end
 
 defimpl String.Chars, for: BubbleMatch.Sentence do
diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex
index e6768b9..e01922b 100644
--- a/lib/bubble_match/token.ex
+++ b/lib/bubble_match/token.ex
@@ -65,6 +65,17 @@ defmodule BubbleMatch.Token do
     }
   end
 
+  @doc """
+  Test whether a token is punctuation
+  """
+  def punct?(%M{type: :punct}) do
+    true
+  end
+
+  def punct?(token) do
+    pos?(token, "PUNCT")
+  end
+
   @doc """
   Test whether a token mathces the given POS (part-of-speech) tag.
   """
diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs
index a2b990c..702b17e 100644
--- a/test/bubble_match/sentence_test.exs
+++ b/test/bubble_match/sentence_test.exs
@@ -3,6 +3,11 @@ defmodule BubbleMatch.SentenceTest do
 
   alias BubbleMatch.{Entity, Sentence}
 
+  test "tokenize" do
+    sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+    Sentence.print_dot(sentence)
+  end
+
   @spacy_json """
               {"ents":[{"end":27,"label":"PERSON","start":21}],"sents":[{"end":9,"start":0},{"end":27,"start":10}],"text":"Hi there. My name is George","tokens":[{"dep":"ROOT","end":2,"head":0,"id":0,"lemma":"hi","norm":"hi","pos":"INTJ","start":0,"string":"Hi ","tag":"UH"},{"dep":"advmod","end":8,"head":0,"id":1,"lemma":"there","norm":"there","pos":"ADV","start":3,"string":"there","tag":"RB"},{"dep":"punct","end":9,"head":0,"id":2,"lemma":".","norm":".","pos":"PUNCT","start":8,"string":". ","tag":"."},{"dep":"poss","end":12,"head":4,"id":3,"lemma":"-PRON-","norm":"my","pos":"DET","start":10,"string":"My ","tag":"PRP$"},{"dep":"nsubj","end":17,"head":5,"id":4,"lemma":"name","norm":"name","pos":"NOUN","start":13,"string":"name ","tag":"NN"},{"dep":"ROOT","end":20,"head":5,"id":5,"lemma":"be","norm":"is","pos":"AUX","start":18,"string":"is ","tag":"VBZ"},{"dep":"attr","end":27,"head":5,"id":6,"lemma":"George","norm":"george","pos":"PROPN","start":21,"string":"George","tag":"NNP"}]}
               """

From 37217bc047575d6cc96f378551691c18ce91ddf9 Mon Sep 17 00:00:00 2001
From: Arjan Scherpenisse <arjan@scherpenisse.net>
Date: Mon, 29 Jun 2020 09:36:05 +0200
Subject: [PATCH 2/4] Skip all punct tokens in one path

---
 lib/bubble_match/sentence.ex        | 33 ++++++++++++++++++++++++++---
 test/bubble_match/sentence_test.exs |  2 +-
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex
index 7604a99..9918b04 100644
--- a/lib/bubble_match/sentence.ex
+++ b/lib/bubble_match/sentence.ex
@@ -190,11 +190,38 @@ defmodule BubbleMatch.Sentence do
     :digraph.add_vertex(graph, b)
     :digraph.add_edge(graph, prev, a)
 
-    if Token.punct?(a) do
-      :digraph.add_edge(graph, prev, b)
+    case Token.punct?(a) do
+      true ->
+        case eat_punct([b | rest], a, graph) do
+          [nonpunct | rest] ->
+            :digraph.add_vertex(graph, nonpunct)
+            :digraph.add_edge(graph, prev, nonpunct)
+            build_token_graph(rest, nonpunct, graph)
+
+          [] ->
+            graph
+        end
+
+      false ->
+        build_token_graph([b | rest], a, graph)
     end
+  end
+
+  defp eat_punct([], _prev, _graph) do
+    []
+  end
 
-    build_token_graph([b | rest], a, graph)
+  defp eat_punct([t | rest], prev, graph) do
+    :digraph.add_vertex(graph, t)
+    :digraph.add_edge(graph, prev, t)
+
+    case Token.punct?(t) do
+      true ->
+        eat_punct(rest, t, graph)
+
+      false ->
+        [t | rest]
+    end
   end
 
   def print_dot(sentence) do
diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs
index 702b17e..001d2ec 100644
--- a/test/bubble_match/sentence_test.exs
+++ b/test/bubble_match/sentence_test.exs
@@ -4,7 +4,7 @@ defmodule BubbleMatch.SentenceTest do
   alias BubbleMatch.{Entity, Sentence}
 
   test "tokenize" do
-    sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+    sentence = Sentence.naive_tokenize("My birthday,,, is the day after tomorrow, 10 miles away")
     Sentence.print_dot(sentence)
   end
 

From 1f9f95d733a847947a5ca59a804aeae910d49cf9 Mon Sep 17 00:00:00 2001
From: Arjan Scherpenisse <arjan@scherpenisse.net>
Date: Mon, 29 Jun 2020 21:45:14 +0200
Subject: [PATCH 3/4] Implement digraph for Spacy and Duckling

---
 lib/bubble_match/sentence.ex        | 207 +++++++++-------------------
 lib/bubble_match/token.ex           |   3 +
 test/bubble_match/sentence_test.exs |  33 +++--
 3 files changed, 95 insertions(+), 148 deletions(-)

diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex
index 0b41060..b97ff4a 100644
--- a/lib/bubble_match/sentence.ex
+++ b/lib/bubble_match/sentence.ex
@@ -40,49 +40,38 @@ defmodule BubbleMatch.Sentence do
 
   def naive_tokenize(input) when is_binary(input) do
     tokens = Tokenizer.tokenize(input)
-    graph = build_token_graph(tokens)
+    graph = new_graph() |> build_token_graph(tokens)
     %M{text: input, tokenizations: graph}
   end
 
   @doc """
-  Convert a JSON blob from Spacy NLP data into a list of sentences
+  Convert a JSON blob from Spacy NLP data into a sentence.
 
   This function takes the output of Spacy's [Doc.to_json][spacy]
-  function and converts it into a list of sentences.
+  function and creates a Sentence struct from it. Note that the struct
+  might actually contain more than one sentence.
 
   [spacy]: https://spacy.io/api/doc#to_json
   """
-  @spec sentences_from_spacy(spacy_json :: map()) :: [t()]
-  def sentences_from_spacy(spacy_json) do
-    spacy_sentences_split(spacy_json["sents"], spacy_json, [])
-    |> Enum.map(fn {text, tokens, entities} ->
-      %M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)}
-      |> add_spacy_entities(entities, spacy_json)
-    end)
-  end
-
-  defp spacy_sentences_split([], _result, acc) do
-    Enum.reverse(acc)
-  end
+  @spec from_spacy(spacy_json :: map()) :: t()
+  def from_spacy(spacy_json) do
+    text = spacy_json["text"]
 
-  defp spacy_sentences_split([%{"start" => start, "end" => end_} | rest], result, acc) do
-    s_text = String.slice(result["text"], start, end_ - start)
-
-    s_tokens =
-      result["tokens"]
-      |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
+    tokens =
+      spacy_json["tokens"]
       |> Enum.map(&Token.from_spacy/1)
-      |> reindex()
 
-    s_ents = result["ents"] |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
-    spacy_sentences_split(rest, result, [{s_text, s_tokens, s_ents} | acc])
-  end
+    graph = new_graph()
 
-  defp add_spacy_entities(%M{} = m, [], _), do: m
+    for %{"start" => start, "end" => end_} <- spacy_json["sents"] do
+      ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_))
+      build_token_graph(graph, ts)
+    end
 
-  defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do
-    sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)])
-    add_tokenization(m, sequences)
+    # add entities
+    ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text))
+    add_entities(graph, ents)
+    %M{text: text, tokenizations: graph}
   end
 
   @doc """
@@ -98,70 +87,22 @@ defmodule BubbleMatch.Sentence do
   def add_duckling_entities(%M{} = sentence, []), do: sentence
 
   def add_duckling_entities(%M{} = sentence, entities) do
-    sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)])
-    add_tokenization(sentence, sequences)
-  end
-
-  @doc false
-  def add_tokenization(%M{} = m, replace_token_sequences) do
-    raw_tokens = List.last(m.tokenizations)
-
-    tokenization =
-      replace_token_sequences
-      |> Enum.reduce(raw_tokens, fn seq, toks ->
-        replace_tokens(toks, seq)
-      end)
+    ents = Enum.map(entities, &Token.from_duckling_entity(&1))
 
-    tokenizations = both_if_different(no_punct(tokenization), tokenization)
-    %M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)}
+    add_entities(sentence.tokenizations, ents)
+    sentence
   end
 
-  defp replace_tokens(token_sequence, replace_tokens) do
-    # find
-    start = List.first(replace_tokens).start
-    end_ = List.last(replace_tokens).end
-
-    start_idx = Enum.find_index(token_sequence, &(&1.start == start))
-    end_idx = Enum.find_index(token_sequence, &(&1.end == end_))
-
-    cond do
-      start_idx != nil and end_idx != nil and end_idx >= start_idx ->
-        {a, _} = Enum.split(token_sequence, start_idx)
-        {_, b} = Enum.split(token_sequence, end_idx + 1)
-
-        (a ++ replace_tokens ++ b)
-        |> reindex()
-
-      start_idx != nil and end_idx == nil ->
-        {a, _} = Enum.split(token_sequence, start_idx)
-
-        (a ++ replace_tokens)
-        |> reindex()
-
-      start_idx == nil and end_idx != nil ->
-        {_, b} = Enum.split(token_sequence, end_idx + 1)
-
-        (replace_tokens ++ b)
-        |> reindex()
-
-      true ->
-        # raise RuntimeError, "Token not found at start = #{start}, end = #{end_}"
-        token_sequence
-    end
-  end
-
-  defp reindex(tokens) do
-    tokens
-    |> Enum.with_index()
-    |> Enum.map(fn {t, index} ->
-      %{t | index: index}
-    end)
-  end
+  ###
 
-  defp build_token_graph(tokens) do
+  defp new_graph() do
     graph = :digraph.new([:acyclic])
     :digraph.add_vertex(graph, :start)
     :digraph.add_vertex(graph, :end)
+    graph
+  end
+
+  defp build_token_graph(graph, tokens) do
     build_token_graph(tokens, :start, graph)
   end
 
@@ -175,11 +116,6 @@ defmodule BubbleMatch.Sentence do
     :digraph.add_vertex(graph, :end)
     :digraph.add_edge(graph, prev, last)
     :digraph.add_edge(graph, last, :end)
-
-    if Token.punct?(last) do
-      :digraph.add_edge(graph, prev, :end)
-    end
-
     graph
   end
 
@@ -187,70 +123,61 @@ defmodule BubbleMatch.Sentence do
     :digraph.add_vertex(graph, a)
     :digraph.add_vertex(graph, b)
     :digraph.add_edge(graph, prev, a)
-
-    case Token.punct?(a) do
-      true ->
-        case eat_punct([b | rest], a, graph) do
-          [nonpunct | rest] ->
-            :digraph.add_vertex(graph, nonpunct)
-            :digraph.add_edge(graph, prev, nonpunct)
-            build_token_graph(rest, nonpunct, graph)
-
-          [] ->
-            graph
-        end
-
-      false ->
-        build_token_graph([b | rest], a, graph)
-    end
+    build_token_graph([b | rest], a, graph)
   end
 
-  defp eat_punct([], _prev, _graph) do
-    []
-  end
-
-  defp eat_punct([t | rest], prev, graph) do
-    :digraph.add_vertex(graph, t)
-    :digraph.add_edge(graph, prev, t)
-
-    case Token.punct?(t) do
-      true ->
-        eat_punct(rest, t, graph)
+  defp add_entities(graph, ents) do
+    for %{start: start, end: end_} = ent <- ents do
+      ent = Map.put(ent, :index, :erlang.system_time())
+      :digraph.add_vertex(graph, ent)
 
-      false ->
-        [t | rest]
-    end
-  end
+      t_start =
+        :digraph.vertices(graph)
+        |> Enum.find(&(is_map(&1) && &1.end == start - 1))
 
-  def print_dot(sentence) do
-    IO.puts("digraph {")
+      if t_start do
+        :digraph.add_edge(graph, t_start, ent)
+      else
+        :digraph.add_edge(graph, :start, ent)
+      end
 
-    IO.puts("  start[label=\"START\"]")
-    IO.puts("  end[label=\"END\"]")
+      t_end =
+        :digraph.vertices(graph)
+        |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_)))
 
-    for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do
-      IO.puts("  #{vertex_id(v)}[label=\"#{v.value}\"]")
+      if t_end do
+        :digraph.add_edge(graph, ent, t_end)
+      else
+        :digraph.add_edge(graph, ent, :end)
+      end
     end
 
-    for e <- :digraph.edges(sentence.tokenizations) do
-      {_, from, to, _} = :digraph.edge(sentence.tokenizations, e)
+    graph
+  end
 
-      IO.puts("  #{vertex_id(from)} -> #{vertex_id(to)}")
-    end
+  def make_dot(sentence) do
+    [
+      "digraph {",
+      "  start[label=\"START\"]",
+      "  end[label=\"END\"]",
+      for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do
+        "  #{vertex_id(v)}[label=\"#{v}\"]"
+      end,
+      for e <- :digraph.edges(sentence.tokenizations) do
+        {_, from, to, _} = :digraph.edge(sentence.tokenizations, e)
 
-    IO.puts("}")
+        "  #{vertex_id(from)} -> #{vertex_id(to)}"
+      end,
+      "}"
+    ]
+    |> List.flatten()
+    |> Enum.intersperse("\n")
+    |> IO.chardata_to_string()
   end
 
   defp vertex_id(:start), do: "start"
   defp vertex_id(:end), do: "end"
   defp vertex_id(v), do: "v#{v.index}"
-  defp both_if_different(a, b, rest \\ [])
-  defp both_if_different(a, a, rest), do: [a | rest]
-  defp both_if_different(a, b, rest), do: [a, b | rest]
-
-  defp no_punct(tokens) do
-    tokens |> Enum.reject(&Token.punct?/1)
-  end
 end
 
 defimpl String.Chars, for: BubbleMatch.Sentence do
diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex
index baf446f..c1fa068 100644
--- a/lib/bubble_match/token.ex
+++ b/lib/bubble_match/token.ex
@@ -160,6 +160,9 @@ defmodule BubbleMatch.Token do
 end
 
 defimpl String.Chars, for: BubbleMatch.Token do
+  def to_string(%BubbleMatch.Token{type: :entity, raw: raw, value: value}),
+    do: "#{raw} [#{value.kind}]"
+
   def to_string(%BubbleMatch.Token{raw: raw}), do: raw
 end
 
diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs
index df1cebf..82a8673 100644
--- a/test/bubble_match/sentence_test.exs
+++ b/test/bubble_match/sentence_test.exs
@@ -4,8 +4,14 @@ defmodule BubbleMatch.SentenceTest do
   alias BubbleMatch.{Entity, Sentence}
 
   test "tokenize" do
-    sentence = Sentence.naive_tokenize("My birthday,,, is the day after tomorrow, 10 miles away")
-    Sentence.print_dot(sentence)
+    sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+
+    graph = Sentence.make_dot(sentence)
+    assert String.contains?(graph, "start -> v0")
+    assert String.contains?(graph, "v0 -> v1")
+    assert String.contains?(graph, "v1 -> v2")
+    # punct is skipped
+    assert String.contains?(graph, "v1 -> v3")
   end
 
   @spacy_json """
@@ -14,16 +20,18 @@ defmodule BubbleMatch.SentenceTest do
               |> Jason.decode!()
 
   test "from_spacy" do
-    [hithere, mynameis] = Sentence.sentences_from_spacy(@spacy_json)
+    sentence = Sentence.from_spacy(@spacy_json)
 
-    assert [_, [_, _, _]] = hithere.tokenizations
+    view_graph(sentence)
+    #    System.cmd("dot", ["-Tpng", "/tmp/x.dot"])
+    # assert [_, [_, _, _]] = hithere.tokenizations
 
-    assert [with_ents, raw_tokens] = mynameis.tokenizations
+    # assert [with_ents, raw_tokens] = mynameis.tokenizations
 
-    assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
-    assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)
+    # assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
+    # assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)
 
-    assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
+    # assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
   end
 
   test "match from spacy" do
@@ -62,6 +70,8 @@ defmodule BubbleMatch.SentenceTest do
       Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away")
       |> Sentence.add_duckling_entities(@duckling_json)
 
+    view_graph(sentence)
+
     assert [with_ents, with_ents_punct | _] = sentence.tokenizations
 
     assert [
@@ -132,4 +142,11 @@ defmodule BubbleMatch.SentenceTest do
     assert [with_ents, _raw_tokens] = b.tokenizations
     assert List.first(with_ents).value.kind == "time"
   end
+
+  defp view_graph(sentence) do
+    graph = Sentence.make_dot(sentence)
+
+    File.write!("/tmp/x.dot", graph)
+    :os.cmd('dot /tmp/x.dot -Tpng > /tmp/x.png; eog /tmp/x.png')
+  end
 end

From a99b3098b964f3fa6a086816ea3aeb48c1a08d70 Mon Sep 17 00:00:00 2001
From: Arjan Scherpenisse <arjan@scherpenisse.net>
Date: Thu, 16 Jul 2020 21:59:42 +0200
Subject: [PATCH 4/4] Switch to libgraph

---
 lib/bubble_match/sentence.ex        | 157 ++++++++++++++++++----------
 mix.exs                             |   1 +
 mix.lock                            |   1 +
 test/bubble_match/sentence_test.exs |  24 ++---
 4 files changed, 115 insertions(+), 68 deletions(-)

diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex
index b97ff4a..c803e01 100644
--- a/lib/bubble_match/sentence.ex
+++ b/lib/bubble_match/sentence.ex
@@ -35,7 +35,7 @@ defmodule BubbleMatch.Sentence do
   def naive_tokenize(input)
 
   def naive_tokenize("") do
-    %M{text: "", tokenizations: :digraph.new()}
+    %M{text: "", tokenizations: new_graph()}
   end
 
   def naive_tokenize(input) when is_binary(input) do
@@ -62,15 +62,27 @@ defmodule BubbleMatch.Sentence do
       |> Enum.map(&Token.from_spacy/1)
 
     graph = new_graph()
+    sents = spacy_json["sents"]
 
-    for %{"start" => start, "end" => end_} <- spacy_json["sents"] do
-      ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_))
-      build_token_graph(graph, ts)
-    end
+    # add all sentences
+    graph =
+      Enum.reduce(sents, graph, fn %{"start" => start, "end" => end_}, graph ->
+        ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_))
+        build_token_graph(graph, ts)
+      end)
+
+    pairs = Enum.zip(sents, Enum.drop(sents, 1))
+
+    # Link adjacent sentences; skip a pair when either boundary token is missing.
+    graph =
+      Enum.reduce(pairs, graph, fn {%{"end" => end_}, %{"start" => start}}, graph ->
+        {t_start, t_end} = find_start_end(graph, start, end_)
+        if t_start && t_end, do: Graph.add_edge(graph, t_start, t_end), else: graph
+      end)
 
     # add entities
     ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text))
-    add_entities(graph, ents)
+    graph = add_entities(graph, ents)
     %M{text: text, tokenizations: graph}
   end
 
@@ -89,70 +101,107 @@ defmodule BubbleMatch.Sentence do
   def add_duckling_entities(%M{} = sentence, entities) do
     ents = Enum.map(entities, &Token.from_duckling_entity(&1))
 
-    add_entities(sentence.tokenizations, ents)
-    sentence
+    graph = add_entities(sentence.tokenizations, ents)
+    %M{sentence | tokenizations: graph}
+  end
+
+  def skip_punct(%M{tokenizations: tokens_graph} = sentence) do
+    # Run `connect_punct/3` once per vertex of the incoming graph, threading the
+    # updated graph through, and store the result back on the sentence.
+    vertices = Graph.vertices(tokens_graph)
+    bypassed = Enum.reduce(vertices, tokens_graph, &connect_punct(&2, &1, nil))
+
+    %M{sentence | tokenizations: bypassed}
+  end
 
   ###
 
+  defp connect_punct(graph, v, first) do
+    case out_vertices(graph, v) |> Enum.split_with(&Token.punct?/1) do
+      {[], []} ->
+        graph
+
+      {p, []} ->
+        Enum.reduce(p, graph, fn v2, graph ->
+          connect_punct(graph, v2, first || v)
+        end)
+
+      {_, vs} ->
+        if first && not Token.punct?(first) do
+          Enum.reduce(vs, graph, fn v2, graph ->
+            Graph.add_edge(graph, first, v2)
+          end)
+        else
+          graph
+        end
+    end
+  end
+
   defp new_graph() do
-    graph = :digraph.new([:acyclic])
-    :digraph.add_vertex(graph, :start)
-    :digraph.add_vertex(graph, :end)
-    graph
+    Graph.new(type: :directed)
+    |> Graph.add_vertices([:start, :end])
   end
 
   defp build_token_graph(graph, tokens) do
-    build_token_graph(tokens, :start, graph)
+    build_token_graph(graph, tokens, :start)
   end
 
-  defp build_token_graph([], _prev, graph) do
+  defp build_token_graph(graph, [], _prev) do
     graph
   end
 
-  defp build_token_graph([last], prev, graph) do
-    :digraph.add_vertex(graph, last)
-    :digraph.add_vertex(graph, prev)
-    :digraph.add_vertex(graph, :end)
-    :digraph.add_edge(graph, prev, last)
-    :digraph.add_edge(graph, last, :end)
+  defp build_token_graph(graph, [last], prev) do
     graph
+    |> Graph.add_vertices([last, prev, :end])
+    |> Graph.add_edge(prev, last)
+    |> Graph.add_edge(last, :end)
   end
 
-  defp build_token_graph([a, b | rest], prev, graph) do
-    :digraph.add_vertex(graph, a)
-    :digraph.add_vertex(graph, b)
-    :digraph.add_edge(graph, prev, a)
-    build_token_graph([b | rest], a, graph)
+  defp build_token_graph(graph, [a, b | rest], prev) do
+    graph
+    |> Graph.add_vertices([a, b])
+    |> Graph.add_edge(prev, a)
+    |> build_token_graph([b | rest], a)
+  end
+
+  defp find_start_end(graph, start, end_) do
+    # Only map vertices are inspected; atom vertices such as :start/:end are skipped.
+    tokens = Enum.filter(Graph.vertices(graph), &is_map/1)
+
+    # First token ending one position before `start`, or nil when there is none.
+    before = Enum.find(tokens, fn t -> t.end == start - 1 end)
+
+    # First token beginning one position after `end_` or exactly at it, or nil.
+    following = Enum.find(tokens, fn t -> t.start == end_ + 1 || t.start == end_ end)
+    {before, following}
+  end
 
   defp add_entities(graph, ents) do
-    for %{start: start, end: end_} = ent <- ents do
-      ent = Map.put(ent, :index, :erlang.system_time())
-      :digraph.add_vertex(graph, ent)
-
-      t_start =
-        :digraph.vertices(graph)
-        |> Enum.find(&(is_map(&1) && &1.end == start - 1))
-
-      if t_start do
-        :digraph.add_edge(graph, t_start, ent)
-      else
-        :digraph.add_edge(graph, :start, ent)
-      end
-
-      t_end =
-        :digraph.vertices(graph)
-        |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_)))
-
-      if t_end do
-        :digraph.add_edge(graph, ent, t_end)
-      else
-        :digraph.add_edge(graph, ent, :end)
-      end
-    end
+    Enum.reduce(ents, graph, fn %{start: start, end: end_} = ent, graph ->
+      {t_start, t_end} = find_start_end(graph, start, end_)
+      graph = Graph.add_vertex(graph, ent)
+
+      graph =
+        if t_start do
+          Graph.add_edge(graph, t_start, ent)
+        else
+          Graph.add_edge(graph, :start, ent)
+        end
+
+      graph =
+        if t_end do
+          Graph.add_edge(graph, ent, t_end)
+        else
+          Graph.add_edge(graph, ent, :end)
+        end
+
+      graph
+    end)
+  end
 
-    graph
+  def out_vertices(graph, vertex) do
+    # Targets (`v2`) of the edges leaving `vertex`, in `Graph.out_edges/2` order.
+    Enum.map(Graph.out_edges(graph, vertex), & &1.v2)
+  end
 
   def make_dot(sentence) do
@@ -160,13 +209,11 @@ defmodule BubbleMatch.Sentence do
       "digraph {",
       "  start[label=\"START\"]",
       "  end[label=\"END\"]",
-      for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do
+      for v <- Graph.vertices(sentence.tokenizations), v != :start, v != :end do
         "  #{vertex_id(v)}[label=\"#{v}\"]"
       end,
-      for e <- :digraph.edges(sentence.tokenizations) do
-        {_, from, to, _} = :digraph.edge(sentence.tokenizations, e)
-
-        "  #{vertex_id(from)} -> #{vertex_id(to)}"
+      for e <- Graph.edges(sentence.tokenizations) do
+        "  #{vertex_id(e.v1)} -> #{vertex_id(e.v2)}"
       end,
       "}"
     ]
diff --git a/mix.exs b/mix.exs
index ed0485c..6ed87a3 100644
--- a/mix.exs
+++ b/mix.exs
@@ -44,6 +44,7 @@ defmodule BubbleMatch.MixProject do
       {:nimble_parsec, "~> 0.5.3"},
       {:inflex, "~> 2.0"},
       {:jason, "~> 1.0"},
+      {:libgraph, "~> 0.13"},
       {:ex_doc, ">= 0.0.0", only: :dev},
       {:stream_data, "~> 0.1", only: :test}
     ]
diff --git a/mix.lock b/mix.lock
index 3522b15..9ac1ee3 100644
--- a/mix.lock
+++ b/mix.lock
@@ -9,6 +9,7 @@
   "idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"},
   "inflex": {:hex, :inflex, "2.0.0", "db69d542b8fdb23ac667f9bc0c2395a3983fa2da6ae2efa7ab5dc541928f7a75", [:mix], [], "hexpm", "c018852409bd48b03ad96ed53594186bc074bdd1519043a0ad1fa5697aac4399"},
   "jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"},
+  "libgraph": {:hex, :libgraph, "0.13.3", "20732b7bafb933dcf7351c479e03076ebd14a85fd3202c67a1c197f4f7c2466b", [:mix], [], "hexpm", "78f2576eef615440b46f10060b1de1c86640441422832052686df53dc3c148c6"},
   "makeup": {:hex, :makeup, "1.0.1", "82f332e461dc6c79dbd82fbe2a9c10d48ed07146f0a478286e590c83c52010b5", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "49736fe5b66a08d8575bf5321d716bac5da20c8e6b97714fec2bcd6febcfa1f8"},
   "makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"},
   "match_engine": {:hex, :match_engine, "1.4.5", "d57752c6cc799f5eca89b564a11b3ca0f35253aa4efc9cc35fd1bbb7f35eee42", [:mix], [{:simetric, "~> 0.2.0", [hex: :simetric, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: false]}], "hexpm", "95356a6d3bdbf6667f6d2801a97a30dd8a95400b8c5eeb570e3a8e6c86b5373c"},
diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs
index 82a8673..73606f3 100644
--- a/test/bubble_match/sentence_test.exs
+++ b/test/bubble_match/sentence_test.exs
@@ -4,9 +4,13 @@ defmodule BubbleMatch.SentenceTest do
   alias BubbleMatch.{Entity, Sentence}
 
   test "tokenize" do
-    sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+    sentence =
+      Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away")
+      |> Sentence.skip_punct()
 
     graph = Sentence.make_dot(sentence)
+    # IO.puts(graph)
+    #    view_graph(sentence)
     assert String.contains?(graph, "start -> v0")
     assert String.contains?(graph, "v0 -> v1")
     assert String.contains?(graph, "v1 -> v2")
@@ -69,6 +73,7 @@ defmodule BubbleMatch.SentenceTest do
     sentence =
       Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away")
       |> Sentence.add_duckling_entities(@duckling_json)
+      |> Sentence.skip_punct()
 
     view_graph(sentence)
 
@@ -125,22 +130,15 @@ defmodule BubbleMatch.SentenceTest do
                  |> Jason.decode!()
 
   @time_spacy """
-              {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":1},{"start":2,"end":6}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
+              {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":9}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
               """
               |> Jason.decode!()
 
   test "overlapping duckling entities" do
-    [a, b] = Sentence.sentences_from_spacy(@time_spacy)
-
-    assert [_] = a.tokenizations
-    a = a |> Sentence.add_duckling_entities(@time_duckling)
-    assert [with_ents, _raw_tokens] = a.tokenizations
-    assert List.first(with_ents).value.kind == "time"
-
-    assert [_] = b.tokenizations
-    b = b |> Sentence.add_duckling_entities(@time_duckling)
-    assert [with_ents, _raw_tokens] = b.tokenizations
-    assert List.first(with_ents).value.kind == "time"
+    Sentence.from_spacy(@time_spacy)
+    |> Sentence.add_duckling_entities(@time_duckling)
+    |> Sentence.skip_punct()
+    |> view_graph()
   end
 
   defp view_graph(sentence) do