diff --git a/lib/bubble_match/sentence.ex b/lib/bubble_match/sentence.ex index ad8565f..c803e01 100644 --- a/lib/bubble_match/sentence.ex +++ b/lib/bubble_match/sentence.ex @@ -35,53 +35,55 @@ defmodule BubbleMatch.Sentence do def naive_tokenize(input) def naive_tokenize("") do - %M{text: "", tokenizations: [[]]} + %M{text: "", tokenizations: new_graph()} end def naive_tokenize(input) when is_binary(input) do tokens = Tokenizer.tokenize(input) - %M{text: input, tokenizations: both_if_different(no_punct(tokens), tokens)} + graph = new_graph() |> build_token_graph(tokens) + %M{text: input, tokenizations: graph} end @doc """ - Convert a JSON blob from Spacy NLP data into a list of sentences + Convert a JSON blob from Spacy NLP data into a sentence. This function takes the output of Spacy's [Doc.to_json][spacy] - function and converts it into a list of sentences. + function and creates a Sentence struct from it. Note that the struct + might actually contain more than one sentence. [spacy]: https://spacy.io/api/doc#to_json """ - @spec sentences_from_spacy(spacy_json :: map()) :: [t()] - def sentences_from_spacy(spacy_json) do - spacy_sentences_split(spacy_json["sents"], spacy_json, []) - |> Enum.map(fn {text, tokens, entities} -> - %M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)} - |> add_spacy_entities(entities, spacy_json) - end) - end + @spec from_spacy(spacy_json :: map()) :: [t()] + def from_spacy(spacy_json) do + text = spacy_json["text"] - defp spacy_sentences_split([], _result, acc) do - Enum.reverse(acc) - end + tokens = + spacy_json["tokens"] + |> Enum.map(&Token.from_spacy/1) - defp spacy_sentences_split([%{"start" => start, "end" => end_} | rest], result, acc) do - s_text = String.slice(result["text"], start, end_ - start) + graph = new_graph() + sents = spacy_json["sents"] - s_tokens = - result["tokens"] - |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_)) - |> Enum.map(&Token.from_spacy/1) - |> reindex() + # add all sentences + graph = + Enum.reduce(sents, graph, fn %{"start" => start, "end" => end_}, graph -> + ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_)) + build_token_graph(graph, ts) + end) - s_ents = result["ents"] |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_)) - spacy_sentences_split(rest, result, [{s_text, s_tokens, s_ents} | acc]) - end + [_ | pairs] = Enum.zip([nil | sents], sents) - defp add_spacy_entities(%M{} = m, [], _), do: m + # add edge between sentences + graph = + Enum.reduce(pairs, graph, fn {%{"end" => end_}, %{"start" => start}}, graph -> + {t_start, t_end} = find_start_end(graph, start, end_) + Graph.add_edge(graph, t_start, t_end) + end) - defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do - sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)]) - add_tokenization(m, sequences) + # add entities + ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text)) + graph = add_entities(graph, ents) + %M{text: text, tokenizations: graph} end @doc """ @@ -97,73 +99,132 @@ defmodule BubbleMatch.Sentence do def add_duckling_entities(%M{} = sentence, []), do: sentence def add_duckling_entities(%M{} = sentence, entities) do - sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)]) - add_tokenization(sentence, sequences) - end + ents = Enum.map(entities, &Token.from_duckling_entity(&1)) - @doc false - def add_tokenization(%M{} = m, replace_token_sequences) do - raw_tokens = List.last(m.tokenizations) + graph = add_entities(sentence.tokenizations, ents) + %M{sentence | tokenizations: graph} + end - tokenization = - replace_token_sequences - |> Enum.reduce(raw_tokens, fn seq, toks -> - replace_tokens(toks, seq) + def skip_punct(%M{tokenizations: graph} = m) do + graph = + Enum.reduce(Graph.vertices(graph), graph, fn v, graph -> + connect_punct(graph, v, nil) end) - tokenizations = both_if_different(no_punct(tokenization), tokenization) - %M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)} + %{m | tokenizations: graph} end - defp replace_tokens(token_sequence, replace_tokens) do - # find - start = List.first(replace_tokens).start - end_ = List.last(replace_tokens).end + ### + + defp connect_punct(graph, v, first) do + case out_vertices(graph, v) |> Enum.split_with(&Token.punct?/1) do + {[], []} -> + graph + + {p, []} -> + Enum.reduce(p, graph, fn v2, graph -> + connect_punct(graph, v2, first || v) + end) + + {_, vs} -> + if first && not Token.punct?(first) do + Enum.reduce(vs, graph, fn v2, graph -> + Graph.add_edge(graph, first, v2) + end) + else + graph + end + end + end - start_idx = Enum.find_index(token_sequence, &(&1.start == start)) - end_idx = Enum.find_index(token_sequence, &(&1.end == end_)) + defp new_graph() do + Graph.new(type: :directed) + |> Graph.add_vertices([:start, :end]) + end - cond do - start_idx != nil and end_idx != nil and end_idx >= start_idx -> - {a, _} = Enum.split(token_sequence, start_idx) - {_, b} = Enum.split(token_sequence, end_idx + 1) + defp build_token_graph(graph, tokens) do + build_token_graph(graph, tokens, :start) + end - (a ++ replace_tokens ++ b) - |> reindex() + defp build_token_graph(graph, [], _prev) do + graph + end - start_idx != nil and end_idx == nil -> - {a, _} = Enum.split(token_sequence, start_idx) + defp build_token_graph(graph, [last], prev) do + graph + |> Graph.add_vertices([last, prev, :end]) + |> Graph.add_edge(prev, last) + |> Graph.add_edge(last, :end) + end - (a ++ replace_tokens) - |> reindex() + defp build_token_graph(graph, [a, b | rest], prev) do + graph + |> Graph.add_vertices([a, b]) + |> Graph.add_edge(prev, a) + |> build_token_graph([b | rest], a) + end - start_idx == nil and end_idx != nil -> - {_, b} = Enum.split(token_sequence, end_idx + 1) + defp find_start_end(graph, start, end_) do + t_start = + Graph.vertices(graph) + |> Enum.find(&(is_map(&1) && &1.end == start - 1)) - (replace_tokens ++ b) - |> reindex() + t_end = + Graph.vertices(graph) + |> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_))) - true -> - # raise RuntimeError, "Token not found at start = #{start}, end = #{end_}" - token_sequence - end + {t_start, t_end} end - defp reindex(tokens) do - tokens - |> Enum.with_index() - |> Enum.map(fn {t, index} -> - %{t | index: index} + defp add_entities(graph, ents) do + Enum.reduce(ents, graph, fn %{start: start, end: end_} = ent, graph -> + {t_start, t_end} = find_start_end(graph, start, end_) + graph = Graph.add_vertex(graph, ent) + + graph = + if t_start do + Graph.add_edge(graph, t_start, ent) + else + Graph.add_edge(graph, :start, ent) + end + + graph = + if t_end do + Graph.add_edge(graph, ent, t_end) + else + Graph.add_edge(graph, ent, :end) + end + + graph end) end - defp both_if_different(a, b, rest \\ []) - defp both_if_different(a, a, rest), do: [a | rest] - defp both_if_different(a, b, rest), do: [a, b | rest] + def out_vertices(graph, vertex) do + Graph.out_edges(graph, vertex) + |> Enum.map(fn e -> e.v2 end) + end - defp no_punct(tokens) do - tokens |> Enum.reject(&Token.punct?/1) + def make_dot(sentence) do + [ + "digraph {", + " start[label=\"START\"]", + " end[label=\"END\"]", + for v <- Graph.vertices(sentence.tokenizations), v != :start, v != :end do + " #{vertex_id(v)}[label=\"#{v}\"]" + end, + for e <- Graph.edges(sentence.tokenizations) do + " #{vertex_id(e.v1)} -> #{vertex_id(e.v2)}" + end, + "}" + ] + |> List.flatten() + |> Enum.intersperse("\n") + |> IO.chardata_to_string() end + + defp vertex_id(:start), do: "start" + defp vertex_id(:end), do: "end" + defp vertex_id(v), do: "v#{v.index}" end defimpl String.Chars, for: BubbleMatch.Sentence do diff --git a/lib/bubble_match/token.ex b/lib/bubble_match/token.ex index baf446f..c1fa068 100644 --- a/lib/bubble_match/token.ex +++ b/lib/bubble_match/token.ex @@ -160,6 +160,9 @@ defmodule BubbleMatch.Token do end defimpl String.Chars, for: BubbleMatch.Token do + def to_string(%BubbleMatch.Token{type: :entity, raw: raw, value: value}), + do: "#{raw} [#{value.kind}]" + def to_string(%BubbleMatch.Token{raw: raw}), do: raw end diff --git a/mix.exs b/mix.exs index ed0485c..6ed87a3 100644 --- a/mix.exs +++ b/mix.exs @@ -44,6 +44,7 @@ defmodule BubbleMatch.MixProject do {:nimble_parsec, "~> 0.5.3"}, {:inflex, "~> 2.0"}, {:jason, "~> 1.0"}, + {:libgraph, "~> 0.13"}, {:ex_doc, ">= 0.0.0", only: :dev}, {:stream_data, "~> 0.1", only: :test} ] diff --git a/mix.lock b/mix.lock index 3522b15..9ac1ee3 100644 --- a/mix.lock +++ b/mix.lock @@ -9,6 +9,7 @@ "idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"}, "inflex": {:hex, :inflex, "2.0.0", "db69d542b8fdb23ac667f9bc0c2395a3983fa2da6ae2efa7ab5dc541928f7a75", [:mix], [], "hexpm", "c018852409bd48b03ad96ed53594186bc074bdd1519043a0ad1fa5697aac4399"}, "jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"}, + "libgraph": {:hex, :libgraph, "0.13.3", "20732b7bafb933dcf7351c479e03076ebd14a85fd3202c67a1c197f4f7c2466b", [:mix], [], "hexpm", "78f2576eef615440b46f10060b1de1c86640441422832052686df53dc3c148c6"}, "makeup": {:hex, :makeup, "1.0.1", "82f332e461dc6c79dbd82fbe2a9c10d48ed07146f0a478286e590c83c52010b5", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "49736fe5b66a08d8575bf5321d716bac5da20c8e6b97714fec2bcd6febcfa1f8"}, "makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"}, "match_engine": {:hex, :match_engine, "1.4.5", "d57752c6cc799f5eca89b564a11b3ca0f35253aa4efc9cc35fd1bbb7f35eee42", [:mix], [{:simetric, "~> 0.2.0", [hex: :simetric, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: false]}], "hexpm", "95356a6d3bdbf6667f6d2801a97a30dd8a95400b8c5eeb570e3a8e6c86b5373c"}, diff --git a/test/bubble_match/sentence_test.exs b/test/bubble_match/sentence_test.exs index bdd6283..73606f3 100644 --- a/test/bubble_match/sentence_test.exs +++ b/test/bubble_match/sentence_test.exs @@ -3,22 +3,39 @@ defmodule BubbleMatch.SentenceTest do alias BubbleMatch.{Entity, Sentence} + test "tokenize" do + sentence = + Sentence.naive_tokenize("My birthday, is the day after tomorrow, 10 miles away") + |> Sentence.skip_punct() + + graph = Sentence.make_dot(sentence) + # IO.puts(graph) + # view_graph(sentence) + assert String.contains?(graph, "start -> v0") + assert String.contains?(graph, "v0 -> v1") + assert String.contains?(graph, "v1 -> v2") + # punct is skipped + assert String.contains?(graph, "v1 -> v3") + end + @spacy_json """ {"ents":[{"end":27,"label":"PERSON","start":21}],"sents":[{"end":9,"start":0},{"end":27,"start":10}],"text":"Hi there. My name is George","tokens":[{"dep":"ROOT","end":2,"head":0,"id":0,"lemma":"hi","norm":"hi","pos":"INTJ","start":0,"string":"Hi ","tag":"UH"},{"dep":"advmod","end":8,"head":0,"id":1,"lemma":"there","norm":"there","pos":"ADV","start":3,"string":"there","tag":"RB"},{"dep":"punct","end":9,"head":0,"id":2,"lemma":".","norm":".","pos":"PUNCT","start":8,"string":". ","tag":"."},{"dep":"poss","end":12,"head":4,"id":3,"lemma":"-PRON-","norm":"my","pos":"DET","start":10,"string":"My ","tag":"PRP$"},{"dep":"nsubj","end":17,"head":5,"id":4,"lemma":"name","norm":"name","pos":"NOUN","start":13,"string":"name ","tag":"NN"},{"dep":"ROOT","end":20,"head":5,"id":5,"lemma":"be","norm":"is","pos":"AUX","start":18,"string":"is ","tag":"VBZ"},{"dep":"attr","end":27,"head":5,"id":6,"lemma":"George","norm":"george","pos":"PROPN","start":21,"string":"George","tag":"NNP"}]} """ |> Jason.decode!() test "from_spacy" do - [hithere, mynameis] = Sentence.sentences_from_spacy(@spacy_json) + sentence = Sentence.from_spacy(@spacy_json) - assert [_, [_, _, _]] = hithere.tokenizations + view_graph(sentence) + # System.cmd("dot", ["-Tpng", "/tmp/x.dot"]) + # assert [_, [_, _, _]] = hithere.tokenizations - assert [with_ents, raw_tokens] = mynameis.tokenizations + # assert [with_ents, raw_tokens] = mynameis.tokenizations - assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"]) - assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type) + # assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"]) + # assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type) - assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents + # assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents end test "match from spacy" do @@ -56,6 +73,9 @@ defmodule BubbleMatch.SentenceTest do sentence = Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away") |> Sentence.add_duckling_entities(@duckling_json) + |> Sentence.skip_punct() + + view_graph(sentence) assert [with_ents, with_ents_punct | _] = sentence.tokenizations @@ -110,21 +130,21 @@ defmodule BubbleMatch.SentenceTest do |> Jason.decode!() @time_spacy """ - {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":1},{"start":2,"end":6}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]} + {"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":9}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]} """ |> Jason.decode!() test "overlapping duckling entities" do - [a, b] = Sentence.sentences_from_spacy(@time_spacy) + Sentence.from_spacy(@time_spacy) + |> Sentence.add_duckling_entities(@time_duckling) + |> Sentence.skip_punct() + |> view_graph() + end - assert [_] = a.tokenizations - a = a |> Sentence.add_duckling_entities(@time_duckling) - assert [with_ents, _raw_tokens] = a.tokenizations - assert List.first(with_ents).value.kind == "time" + defp view_graph(sentence) do + graph = Sentence.make_dot(sentence) - assert [_] = b.tokenizations - b = b |> Sentence.add_duckling_entities(@time_duckling) - assert [with_ents, _raw_tokens] = b.tokenizations - assert List.first(with_ents).value.kind == "time" + File.write!("/tmp/x.dot", graph) + :os.cmd('dot /tmp/x.dot -Tpng > /tmp/x.png; eog /tmp/x.png') end end