From db48e08caaea26ca7a97597217b71b9af4e567a9 Mon Sep 17 00:00:00 2001 From: Arjan Scherpenisse Date: Tue, 23 Jun 2020 15:55:21 +0200 Subject: [PATCH 1/4] An experiment in representing a parsed sentence as a DAG instead of a list of tokenizations --- lib/bubble_match/sentence.ex | 61 +++++++++++++++++++++++++++-- lib/bubble_match/token.ex | 11 ++++++ test/bubble_match/sentence_test.exs | 5 +++ 3 files changed, 74 insertions(+), 3 deletions(-) diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex index 54c7a81..7604a99 100644 --- a/lib/bubble_match/sentence.ex +++ b/lib/bubble_match/sentence.ex @@ -35,13 +35,13 @@ defmodule BubbleMatch.Sentence do def naive_tokenize(input) def naive_tokenize("") do - %M{text: "", tokenizations: [[]]} + %M{text: "", tokenizations: :digraph.new()} end def naive_tokenize(input) when is_binary(input) do tokens = Tokenizer.tokenize(input) - no_punct = Tokenizer.strip_punct(tokens) - %M{text: input, tokenizations: both_if_different(no_punct, tokens)} + graph = build_token_graph(tokens) + %M{text: input, tokenizations: graph} end @doc """ @@ -164,6 +164,61 @@ defmodule BubbleMatch.Sentence do defp both_if_different(a, a), do: [a] defp both_if_different(a, b), do: [a, b] + + defp build_token_graph(tokens) do + graph = :digraph.new([:acyclic]) + :digraph.add_vertex(graph, :start) + :digraph.add_vertex(graph, :end) + build_token_graph(tokens, :start, graph) + end + + defp build_token_graph([], _prev, graph) do + graph + end + + defp build_token_graph([last], prev, graph) do + :digraph.add_vertex(graph, last) + :digraph.add_vertex(graph, prev) + :digraph.add_vertex(graph, :end) + :digraph.add_edge(graph, prev, last) + :digraph.add_edge(graph, last, :end) + graph + end + + defp build_token_graph([a, b | rest], prev, graph) do + :digraph.add_vertex(graph, a) + :digraph.add_vertex(graph, b) + :digraph.add_edge(graph, prev, a) + + if Token.punct?(a) do + :digraph.add_edge(graph, prev, b) + end + + build_token_graph([b | rest], a, graph) + end + + def print_dot(sentence) do + IO.puts("digraph {") + + IO.puts(" start[label=\"START\"]") + IO.puts(" end[label=\"END\"]") + + for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do + IO.puts(" #{vertex_id(v)}[label=\"#{v.value}\"]") + end + + for e <- :digraph.edges(sentence.tokenizations) do + {_, from, to, _} = :digraph.edge(sentence.tokenizations, e) + + IO.puts(" #{vertex_id(from)} -> #{vertex_id(to)}") + end + + IO.puts("}") + end + + defp vertex_id(:start), do: "start" + defp vertex_id(:end), do: "end" + defp vertex_id(v), do: "v#{v.index}" end defimpl String.Chars, for: BubbleMatch.Sentence do diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex index e6768b9..e01922b 100644 --- a/lib/bubble_match/token.ex +++ b/lib/bubble_match/token.ex @@ -65,6 +65,17 @@ defmodule BubbleMatch.Token do } end + @doc """ + Test whether a token is punctuation + """ + def punct?(%M{type: :punct}) do + true + end + + def punct?(token) do + pos?(token, "PUNCT") + end + @doc """ Test whether a token mathces the given POS (part-of-speech) tag. """ diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs index a2b990c..702b17e 100644 --- a/test/bubble_match/sentence_test.exs +++ b/test/bubble_match/sentence_test.exs @@ -3,6 +3,11 @@ defmodule BubbleMatch.SentenceTest do alias BubbleMatch.{Entity, Sentence} + test "tokenize" do + sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + Sentence.print_dot(sentence) + end + @spacy_json """ {"ents":[{"end":27,"label":"PERSON","start":21}],"sents":[{"end":9,"start":0},{"end":27,"start":10}],"text":"Hi there. My name is George","tokens":[{"dep":"ROOT","end":2,"head":0,"id":0,"lemma":"hi","norm":"hi","pos":"INTJ","start":0,"string":"Hi ","tag":"UH"},{"dep":"advmod","end":8,"head":0,"id":1,"lemma":"there","norm":"there","pos":"ADV","start":3,"string":"there","tag":"RB"},{"dep":"punct","end":9,"head":0,"id":2,"lemma":".","norm":".","pos":"PUNCT","start":8,"string":". ","tag":"."},{"dep":"poss","end":12,"head":4,"id":3,"lemma":"-PRON-","norm":"my","pos":"DET","start":10,"string":"My ","tag":"PRP$"},{"dep":"nsubj","end":17,"head":5,"id":4,"lemma":"name","norm":"name","pos":"NOUN","start":13,"string":"name ","tag":"NN"},{"dep":"ROOT","end":20,"head":5,"id":5,"lemma":"be","norm":"is","pos":"AUX","start":18,"string":"is ","tag":"VBZ"},{"dep":"attr","end":27,"head":5,"id":6,"lemma":"George","norm":"george","pos":"PROPN","start":21,"string":"George","tag":"NNP"}]} """ From 37217bc047575d6cc96f378551691c18ce91ddf9 Mon Sep 17 00:00:00 2001 From: Arjan Scherpenisse Date: Mon, 29 Jun 2020 09:36:05 +0200 Subject: [PATCH 2/4] Skip all punct tokens in one path --- lib/bubble_match/sentence.ex | 33 ++++++++++++++++++++++++++--- test/bubble_match/sentence_test.exs | 2 +- 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex index 7604a99..9918b04 100644 --- a/lib/bubble_match/sentence.ex +++ b/lib/bubble_match/sentence.ex @@ -190,11 +190,38 @@ defmodule BubbleMatch.Sentence do :digraph.add_vertex(graph, b) :digraph.add_edge(graph, prev, a) - if Token.punct?(a) do - :digraph.add_edge(graph, prev, b) + case Token.punct?(a) do + true -> + case eat_punct([b | rest], a, graph) do + [nonpunct | rest] -> + :digraph.add_vertex(graph, nonpunct) + :digraph.add_edge(graph, prev, nonpunct) + build_token_graph(rest, nonpunct, graph) + + [] -> + graph + end + + false -> + build_token_graph([b | rest], a, graph) end + end + + defp eat_punct([], _prev, _graph) do + [] + end - build_token_graph([b | rest], a, graph) + defp eat_punct([t | rest], prev, graph) do + :digraph.add_vertex(graph, t) + :digraph.add_edge(graph, prev, t) + + case Token.punct?(t) do + true -> + eat_punct(rest, t, graph) + + false -> + [t | rest] + end end def print_dot(sentence) do diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs index 702b17e..001d2ec 100644 --- a/test/bubble_match/sentence_test.exs +++ b/test/bubble_match/sentence_test.exs @@ -4,7 +4,7 @@ defmodule BubbleMatch.SentenceTest do alias BubbleMatch.{Entity, Sentence} test "tokenize" do - sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + sentence = Sentence.naive_tokenize("My birthday,,, is the day after tomorrow, 10 miles away") Sentence.print_dot(sentence) end From 1f9f95d733a847947a5ca59a804aeae910d49cf9 Mon Sep 17 00:00:00 2001 From: Arjan Scherpenisse Date: Mon, 29 Jun 2020 21:45:14 +0200 Subject: [PATCH 3/4] Implement digraph for Spacy and Duckling --- lib/bubble_match/sentence.ex | 207 +++++++++------------------- lib/bubble_match/token.ex | 3 + test/bubble_match/sentence_test.exs | 33 +++-- 3 files changed, 95 insertions(+), 148 deletions(-) diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex index 0b41060..b97ff4a 100644 --- a/lib/bubble_match/sentence.ex +++ b/lib/bubble_match/sentence.ex @@ -40,49 +40,38 @@ defmodule BubbleMatch.Sentence do def naive_tokenize(input) when is_binary(input) do tokens = Tokenizer.tokenize(input) - graph = build_token_graph(tokens) + graph = new_graph() |> build_token_graph(tokens) %M{text: input, tokenizations: graph} end @doc """ - Convert a JSON blob from Spacy NLP data into a list of sentences + Convert a JSON blob from Spacy NLP data into a sentence. This function takes the output of Spacy's [Doc.to_json][spacy] - function and converts it into a list of sentences. + function and creates a Sentence struct from it. Note that the struct + might actually contain more than one sentence. [spacy]: https://spacy.io/api/doc#to_json """ - @spec sentences_from_spacy(spacy_json :: map()) :: [t()] - def sentences_from_spacy(spacy_json) do - spacy_sentences_split(spacy_json["sents"], spacy_json, []) - |> Enum.map(fn {text, tokens, entities} -> - %M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)} - |> add_spacy_entities(entities, spacy_json) - end) - end - - defp spacy_sentences_split([], _result, acc) do - Enum.reverse(acc) - end + @spec from_spacy(spacy_json :: map()) :: [t()] + def from_spacy(spacy_json) do + text = spacy_json["text"] - defp spacy_sentences_split([%{"start" => start, "end" => end_} | rest], result, acc) do - s_text = String.slice(result["text"], start, end_ - start) - - s_tokens = - result["tokens"] - |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_)) + tokens = + spacy_json["tokens"] |> Enum.map(&Token.from_spacy/1) - |> reindex() - s_ents = result["ents"] |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_)) - spacy_sentences_split(rest, result, [{s_text, s_tokens, s_ents} | acc]) - end + graph = new_graph() - defp add_spacy_entities(%M{} = m, [], _), do: m + for %{"start" => start, "end" => end_} <- spacy_json["sents"] do + ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_)) + build_token_graph(graph, ts) + end - defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do - sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)]) - add_tokenization(m, sequences) + # add entities + ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text)) + add_entities(graph, ents) + %M{text: text, tokenizations: graph} end @doc """ @@ -98,70 +87,22 @@ defmodule BubbleMatch.Sentence do def add_duckling_entities(%M{} = sentence, []), do: sentence def add_duckling_entities(%M{} = sentence, entities) do - sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)]) - add_tokenization(sentence, sequences) - end - - @doc false - def add_tokenization(%M{} = m, replace_token_sequences) do - raw_tokens = List.last(m.tokenizations) - - tokenization = - replace_token_sequences - |> Enum.reduce(raw_tokens, fn seq, toks -> - replace_tokens(toks, seq) - end) + ents = Enum.map(entities, &Token.from_duckling_entity(&1)) - tokenizations = both_if_different(no_punct(tokenization), tokenization) - %M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)} + add_entities(sentence.tokenizations, ents) + sentence end - defp replace_tokens(token_sequence, replace_tokens) do - # find - start = List.first(replace_tokens).start - end_ = List.last(replace_tokens).end - - start_idx = Enum.find_index(token_sequence, &(&1.start == start)) - end_idx = Enum.find_index(token_sequence, &(&1.end == end_)) - - cond do - start_idx != nil and end_idx != nil and end_idx >= start_idx -> - {a, _} = Enum.split(token_sequence, start_idx) - {_, b} = Enum.split(token_sequence, end_idx + 1) - - (a ++ replace_tokens ++ b) - |> reindex() - - start_idx != nil and end_idx == nil -> - {a, _} = Enum.split(token_sequence, start_idx) - - (a ++ replace_tokens) - |> reindex() - - start_idx == nil and end_idx != nil -> - {_, b} = Enum.split(token_sequence, end_idx + 1) - - (replace_tokens ++ b) - |> reindex() - - true -> - # raise RuntimeError, "Token not found at start = #{start}, end = #{end_}" - token_sequence - end - end - - defp reindex(tokens) do - tokens - |> Enum.with_index() - |> Enum.map(fn {t, index} -> - %{t | index: index} - end) - end + ### - defp build_token_graph(tokens) do + defp new_graph() do graph = :digraph.new([:acyclic]) :digraph.add_vertex(graph, :start) :digraph.add_vertex(graph, :end) + graph + end + + defp build_token_graph(graph, tokens) do build_token_graph(tokens, :start, graph) end @@ -175,11 +116,6 @@ defmodule BubbleMatch.Sentence do :digraph.add_vertex(graph, :end) :digraph.add_edge(graph, prev, last) :digraph.add_edge(graph, last, :end) - - if Token.punct?(last) do - :digraph.add_edge(graph, prev, :end) - end - graph end @@ -187,70 +123,61 @@ defmodule BubbleMatch.Sentence do :digraph.add_vertex(graph, a) :digraph.add_vertex(graph, b) :digraph.add_edge(graph, prev, a) - - case Token.punct?(a) do - true -> - case eat_punct([b | rest], a, graph) do - [nonpunct | rest] -> - :digraph.add_vertex(graph, nonpunct) - :digraph.add_edge(graph, prev, nonpunct) - build_token_graph(rest, nonpunct, graph) - - [] -> - graph - end - - false -> - build_token_graph([b | rest], a, graph) - end + build_token_graph([b | rest], a, graph) end - defp eat_punct([], _prev, _graph) do - [] - end - - defp eat_punct([t | rest], prev, graph) do - :digraph.add_vertex(graph, t) - :digraph.add_edge(graph, prev, t) - - case Token.punct?(t) do - true -> - eat_punct(rest, t, graph) + defp add_entities(graph, ents) do + for %{start: start, end: end_} = ent <- ents do + ent = Map.put(ent, :index, :erlang.system_time()) + :digraph.add_vertex(graph, ent) - false -> - [t | rest] - end - end + t_start = + :digraph.vertices(graph) + |> Enum.find(&(is_map(&1) && &1.end == start - 1)) - def print_dot(sentence) do - IO.puts("digraph {") + if t_start do + :digraph.add_edge(graph, t_start, ent) + else + :digraph.add_edge(graph, :start, ent) + end - IO.puts(" start[label=\"START\"]") - IO.puts(" end[label=\"END\"]") + t_end = + :digraph.vertices(graph) + |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_))) - for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do - IO.puts(" #{vertex_id(v)}[label=\"#{v.value}\"]") + if t_end do + :digraph.add_edge(graph, ent, t_end) + else + :digraph.add_edge(graph, ent, :end) + end end - for e <- :digraph.edges(sentence.tokenizations) do - {_, from, to, _} = :digraph.edge(sentence.tokenizations, e) + graph + end - IO.puts(" #{vertex_id(from)} -> #{vertex_id(to)}") - end + def make_dot(sentence) do + [ + "digraph {", + " start[label=\"START\"]", + " end[label=\"END\"]", + for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do + " #{vertex_id(v)}[label=\"#{v}\"]" + end, + for e <- :digraph.edges(sentence.tokenizations) do + {_, from, to, _} = :digraph.edge(sentence.tokenizations, e) - IO.puts("}") + " #{vertex_id(from)} -> #{vertex_id(to)}" + end, + "}" + ] + |> List.flatten() + |> Enum.intersperse("\n") + |> IO.chardata_to_string() end defp vertex_id(:start), do: "start" defp vertex_id(:end), do: "end" defp vertex_id(v), do: "v#{v.index}" - defp both_if_different(a, b, rest \\ []) - defp both_if_different(a, a, rest), do: [a | rest] - defp both_if_different(a, b, rest), do: [a, b | rest] - - defp no_punct(tokens) do - tokens |> Enum.reject(&Token.punct?/1) - end end defimpl String.Chars, for: BubbleMatch.Sentence do diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex index baf446f..c1fa068 100644 --- a/lib/bubble_match/token.ex +++ b/lib/bubble_match/token.ex @@ -160,6 +160,9 @@ defmodule BubbleMatch.Token do end defimpl String.Chars, for: BubbleMatch.Token do + def to_string(%BubbleMatch.Token{type: :entity, raw: raw, value: value}), + do: "#{raw} [#{value.kind}]" + def to_string(%BubbleMatch.Token{raw: raw}), do: raw end diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs index df1cebf..82a8673 100644 --- a/test/bubble_match/sentence_test.exs +++ b/test/bubble_match/sentence_test.exs @@ -4,8 +4,14 @@ defmodule BubbleMatch.SentenceTest do alias BubbleMatch.{Entity, Sentence} test "tokenize" do - sentence = Sentence.naive_tokenize("My birthday,,, is the day after tomorrow, 10 miles away") - Sentence.print_dot(sentence) + sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + + graph = Sentence.make_dot(sentence) + assert String.contains?(graph, "start -> v0") + assert String.contains?(graph, "v0 -> v1") + assert String.contains?(graph, "v1 -> v2") + # punct is skipped + assert String.contains?(graph, "v1 -> v3") end @spacy_json """ @@ -14,16 +20,18 @@ defmodule BubbleMatch.SentenceTest do |> Jason.decode!() test "from_spacy" do - [hithere, mynameis] = Sentence.sentences_from_spacy(@spacy_json) + sentence = Sentence.from_spacy(@spacy_json) - assert [_, [_, _, _]] = hithere.tokenizations + view_graph(sentence) + # System.cmd("dot", ["-Tpng", "/tmp/x.dot"]) + # assert [_, [_, _, _]] = hithere.tokenizations - assert [with_ents, raw_tokens] = mynameis.tokenizations + # assert [with_ents, raw_tokens] = mynameis.tokenizations - assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"]) - assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type) + # assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"]) + # assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type) - assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents + # assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents end test "match from spacy" do @@ -62,6 +70,8 @@ defmodule BubbleMatch.SentenceTest do Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away") |> Sentence.add_duckling_entities(@duckling_json) + view_graph(sentence) + assert [with_ents, with_ents_punct | _] = sentence.tokenizations assert [ @@ -132,4 +142,11 @@ defmodule BubbleMatch.SentenceTest do assert [with_ents, _raw_tokens] = b.tokenizations assert List.first(with_ents).value.kind == "time" end + + defp view_graph(sentence) do + graph = Sentence.make_dot(sentence) + + File.write!("/tmp/x.dot", graph) + :os.cmd('dot /tmp/x.dot -Tpng > /tmp/x.png; eog /tmp/x.png') + end end From a99b3098b964f3fa6a086816ea3aeb48c1a08d70 Mon Sep 17 00:00:00 2001 From: Arjan Scherpenisse Date: Thu, 16 Jul 2020 21:59:42 +0200 Subject: [PATCH 4/4] Switch to libgraph --- lib/bubble_match/sentence.ex | 157 ++++++++++++++++++---------- mix.exs | 1 + mix.lock | 1 + test/bubble_match/sentence_test.exs | 24 ++--- 4 files changed, 115 insertions(+), 68 deletions(-) diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex index b97ff4a..c803e01 100644 --- a/lib/bubble_match/sentence.ex +++ b/lib/bubble_match/sentence.ex @@ -35,7 +35,7 @@ defmodule BubbleMatch.Sentence do def naive_tokenize(input) def naive_tokenize("") do - %M{text: "", tokenizations: :digraph.new()} + %M{text: "", tokenizations: new_graph()} end def naive_tokenize(input) when is_binary(input) do @@ -62,15 +62,27 @@ defmodule BubbleMatch.Sentence do |> Enum.map(&Token.from_spacy/1) graph = new_graph() + sents = spacy_json["sents"] - for %{"start" => start, "end" => end_} <- spacy_json["sents"] do - ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_)) - build_token_graph(graph, ts) - end + # add all sentences + graph = + Enum.reduce(sents, graph, fn %{"start" => start, "end" => end_}, graph -> + ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_)) + build_token_graph(graph, ts) + end) + + [_ | pairs] = Enum.zip([nil | sents], sents) + + # add edge between sentences + graph = + Enum.reduce(pairs, graph, fn {%{"end" => end_}, %{"start" => start}}, graph -> + {t_start, t_end} = find_start_end(graph, start, end_) + Graph.add_edge(graph, t_start, t_end) + end) # add entities ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text)) - add_entities(graph, ents) + graph = add_entities(graph, ents) %M{text: text, tokenizations: graph} end @@ -89,70 +101,107 @@ defmodule BubbleMatch.Sentence do def add_duckling_entities(%M{} = sentence, entities) do ents = Enum.map(entities, &Token.from_duckling_entity(&1)) - add_entities(sentence.tokenizations, ents) - sentence + graph = add_entities(sentence.tokenizations, ents) + %M{sentence | tokenizations: graph} + end + + def skip_punct(%M{tokenizations: graph} = m) do + graph = + Enum.reduce(Graph.vertices(graph), graph, fn v, graph -> + connect_punct(graph, v, nil) + end) + + %{m | tokenizations: graph} end ### + defp connect_punct(graph, v, first) do + case out_vertices(graph, v) |> Enum.split_with(&Token.punct?/1) do + {[], []} -> + graph + + {p, []} -> + Enum.reduce(p, graph, fn v2, graph -> + connect_punct(graph, v2, first || v) + end) + + {_, vs} -> + if first && not Token.punct?(first) do + Enum.reduce(vs, graph, fn v2, graph -> + Graph.add_edge(graph, first, v2) + end) + else + graph + end + end + end + defp new_graph() do - graph = :digraph.new([:acyclic]) - :digraph.add_vertex(graph, :start) - :digraph.add_vertex(graph, :end) - graph + Graph.new(type: :directed) + |> Graph.add_vertices([:start, :end]) end defp build_token_graph(graph, tokens) do - build_token_graph(tokens, :start, graph) + build_token_graph(graph, tokens, :start) end - defp build_token_graph([], _prev, graph) do + defp build_token_graph(graph, [], _prev) do graph end - defp build_token_graph([last], prev, graph) do - :digraph.add_vertex(graph, last) - :digraph.add_vertex(graph, prev) - :digraph.add_vertex(graph, :end) - :digraph.add_edge(graph, prev, last) - :digraph.add_edge(graph, last, :end) + defp build_token_graph(graph, [last], prev) do graph + |> Graph.add_vertices([last, prev, :end]) + |> Graph.add_edge(prev, last) + |> Graph.add_edge(last, :end) end - defp build_token_graph([a, b | rest], prev, graph) do - :digraph.add_vertex(graph, a) - :digraph.add_vertex(graph, b) - :digraph.add_edge(graph, prev, a) - build_token_graph([b | rest], a, graph) + defp build_token_graph(graph, [a, b | rest], prev) do + graph + |> Graph.add_vertices([a, b]) + |> Graph.add_edge(prev, a) + |> build_token_graph([b | rest], a) + end + + defp find_start_end(graph, start, end_) do + t_start = + Graph.vertices(graph) + |> Enum.find(&(is_map(&1) && &1.end == start - 1)) + + t_end = + Graph.vertices(graph) + |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_))) + + {t_start, t_end} end defp add_entities(graph, ents) do - for %{start: start, end: end_} = ent <- ents do - ent = Map.put(ent, :index, :erlang.system_time()) - :digraph.add_vertex(graph, ent) - - t_start = - :digraph.vertices(graph) - |> Enum.find(&(is_map(&1) && &1.end == start - 1)) - - if t_start do - :digraph.add_edge(graph, t_start, ent) - else - :digraph.add_edge(graph, :start, ent) - end - - t_end = - :digraph.vertices(graph) - |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_))) - - if t_end do - :digraph.add_edge(graph, ent, t_end) - else - :digraph.add_edge(graph, ent, :end) - end - end + Enum.reduce(ents, graph, fn %{start: start, end: end_} = ent, graph -> + {t_start, t_end} = find_start_end(graph, start, end_) + graph = Graph.add_vertex(graph, ent) + + graph = + if t_start do + Graph.add_edge(graph, t_start, ent) + else + Graph.add_edge(graph, :start, ent) + end + + graph = + if t_end do + Graph.add_edge(graph, ent, t_end) + else + Graph.add_edge(graph, ent, :end) + end + + graph + end) + end - graph + def out_vertices(graph, vertex) do + Graph.out_edges(graph, vertex) + |> Enum.map(fn e -> e.v2 end) end def make_dot(sentence) do @@ -160,13 +209,11 @@ defmodule BubbleMatch.Sentence do "digraph {", " start[label=\"START\"]", " end[label=\"END\"]", - for v <- :digraph.vertices(sentence.tokenizations), v != :start, v != :end do + for v <- Graph.vertices(sentence.tokenizations), v != :start, v != :end do " #{vertex_id(v)}[label=\"#{v}\"]" end, - for e <- :digraph.edges(sentence.tokenizations) do - {_, from, to, _} = :digraph.edge(sentence.tokenizations, e) - - " #{vertex_id(from)} -> #{vertex_id(to)}" + for e <- Graph.edges(sentence.tokenizations) do + " #{vertex_id(e.v1)} -> #{vertex_id(e.v2)}" end, "}" ] diff --git a/mix.exs b/mix.exs index ed0485c..6ed87a3 100644 --- a/mix.exs +++ b/mix.exs @@ -44,6 +44,7 @@ defmodule BubbleMatch.MixProject do {:nimble_parsec, "~> 0.5.3"}, {:inflex, "~> 2.0"}, {:jason, "~> 1.0"}, + {:libgraph, "~> 0.13"}, {:ex_doc, ">= 0.0.0", only: :dev}, {:stream_data, "~> 0.1", only: :test} ] diff --git a/mix.lock b/mix.lock index 3522b15..9ac1ee3 100644 --- a/mix.lock +++ b/mix.lock @@ -9,6 +9,7 @@ "idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"}, "inflex": {:hex, :inflex, "2.0.0", "db69d542b8fdb23ac667f9bc0c2395a3983fa2da6ae2efa7ab5dc541928f7a75", [:mix], [], "hexpm", "c018852409bd48b03ad96ed53594186bc074bdd1519043a0ad1fa5697aac4399"}, "jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"}, + "libgraph": {:hex, :libgraph, "0.13.3", "20732b7bafb933dcf7351c479e03076ebd14a85fd3202c67a1c197f4f7c2466b", [:mix], [], "hexpm", "78f2576eef615440b46f10060b1de1c86640441422832052686df53dc3c148c6"}, "makeup": {:hex, :makeup, "1.0.1", "82f332e461dc6c79dbd82fbe2a9c10d48ed07146f0a478286e590c83c52010b5", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "49736fe5b66a08d8575bf5321d716bac5da20c8e6b97714fec2bcd6febcfa1f8"}, "makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"}, "match_engine": {:hex, :match_engine, "1.4.5", "d57752c6cc799f5eca89b564a11b3ca0f35253aa4efc9cc35fd1bbb7f35eee42", [:mix], [{:simetric, "~> 0.2.0", [hex: :simetric, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: false]}], "hexpm", "95356a6d3bdbf6667f6d2801a97a30dd8a95400b8c5eeb570e3a8e6c86b5373c"}, diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs index 82a8673..73606f3 100644 --- a/test/bubble_match/sentence_test.exs +++ b/test/bubble_match/sentence_test.exs @@ -4,9 +4,13 @@ defmodule BubbleMatch.SentenceTest do alias BubbleMatch.{Entity, Sentence} test "tokenize" do - sentence = Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + sentence = + Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + |> Sentence.skip_punct() graph = Sentence.make_dot(sentence) + # IO.puts(graph) + # view_graph(sentence) assert String.contains?(graph, "start -> v0") assert String.contains?(graph, "v0 -> v1") assert String.contains?(graph, "v1 -> v2") @@ -69,6 +73,7 @@ defmodule BubbleMatch.SentenceTest do sentence = Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away") |> Sentence.add_duckling_entities(@duckling_json) + |> Sentence.skip_punct() view_graph(sentence) @@ -125,22 +130,15 @@ defmodule BubbleMatch.SentenceTest do |> Jason.decode!() @time_spacy """ - {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":1},{"start":2,"end":6}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]} + {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":9}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]} """ |> Jason.decode!() test "overlapping duckling entities" do - [a, b] = Sentence.sentences_from_spacy(@time_spacy) - - assert [_] = a.tokenizations - a = a |> Sentence.add_duckling_entities(@time_duckling) - assert [with_ents, _raw_tokens] = a.tokenizations - assert List.first(with_ents).value.kind == "time" - - assert [_] = b.tokenizations - b = b |> Sentence.add_duckling_entities(@time_duckling) - assert [with_ents, _raw_tokens] = b.tokenizations - assert List.first(with_ents).value.kind == "time" + Sentence.from_spacy(@time_spacy) + |> Sentence.add_duckling_entities(@time_duckling) + |> Sentence.skip_punct() + |> view_graph() end defp view_graph(sentence) do