Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 136 additions & 75 deletions lib/bubble_match/sentence.ex
Original file line number Diff line number Diff line change
Expand Up @@ -35,53 +35,55 @@ defmodule BubbleMatch.Sentence do
def naive_tokenize(input)

def naive_tokenize("") do
%M{text: "", tokenizations: [[]]}
%M{text: "", tokenizations: new_graph()}
end

def naive_tokenize(input) when is_binary(input) do
tokens = Tokenizer.tokenize(input)
%M{text: input, tokenizations: both_if_different(no_punct(tokens), tokens)}
graph = new_graph() |> build_token_graph(tokens)
%M{text: input, tokenizations: graph}
end

@doc """
Convert a JSON blob from Spacy NLP data into a list of sentences
Convert a JSON blob from Spacy NLP data into a sentence.

This function takes the output of Spacy's [Doc.to_json][spacy]
function and converts it into a list of sentences.
function and creates a Sentence struct from it. Note that the struct
might actually contain more than one sentence.

[spacy]: https://spacy.io/api/doc#to_json
"""
@spec sentences_from_spacy(spacy_json :: map()) :: [t()]
def sentences_from_spacy(spacy_json) do
spacy_sentences_split(spacy_json["sents"], spacy_json, [])
|> Enum.map(fn {text, tokens, entities} ->
%M{text: text, tokenizations: both_if_different(no_punct(tokens), tokens)}
|> add_spacy_entities(entities, spacy_json)
end)
end
@spec from_spacy(spacy_json :: map()) :: [t()]
def from_spacy(spacy_json) do
text = spacy_json["text"]

defp spacy_sentences_split([], _result, acc) do
Enum.reverse(acc)
end
tokens =
spacy_json["tokens"]
|> Enum.map(&Token.from_spacy/1)

defp spacy_sentences_split([%{"start" => start, "end" => end_} | rest], result, acc) do
s_text = String.slice(result["text"], start, end_ - start)
graph = new_graph()
sents = spacy_json["sents"]

s_tokens =
result["tokens"]
|> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
|> Enum.map(&Token.from_spacy/1)
|> reindex()
# add all sentences
graph =
Enum.reduce(sents, graph, fn %{"start" => start, "end" => end_}, graph ->
ts = Enum.filter(tokens, &(&1.start >= start && &1.end <= end_))
build_token_graph(graph, ts)
end)

s_ents = result["ents"] |> Enum.filter(&(&1["start"] >= start && &1["end"] <= end_))
spacy_sentences_split(rest, result, [{s_text, s_tokens, s_ents} | acc])
end
[_ | pairs] = Enum.zip([nil | sents], sents)

defp add_spacy_entities(%M{} = m, [], _), do: m
# add edge between sentences
graph =
Enum.reduce(pairs, graph, fn {%{"end" => end_}, %{"start" => start}}, graph ->
{t_start, t_end} = find_start_end(graph, start, end_)
Graph.add_edge(graph, t_start, t_end)
end)

defp add_spacy_entities(%M{} = m, ents, %{"text" => text}) do
sequences = Enum.map(ents, &[Token.from_spacy_entity(&1, text)])
add_tokenization(m, sequences)
# add entities
ents = Enum.map(spacy_json["ents"], &Token.from_spacy_entity(&1, text))
graph = add_entities(graph, ents)
%M{text: text, tokenizations: graph}
end

@doc """
Expand All @@ -97,73 +99,132 @@ defmodule BubbleMatch.Sentence do
def add_duckling_entities(%M{} = sentence, []), do: sentence

def add_duckling_entities(%M{} = sentence, entities) do
sequences = Enum.map(entities, &[Token.from_duckling_entity(&1)])
add_tokenization(sentence, sequences)
end
ents = Enum.map(entities, &Token.from_duckling_entity(&1))

@doc false
def add_tokenization(%M{} = m, replace_token_sequences) do
raw_tokens = List.last(m.tokenizations)
graph = add_entities(sentence.tokenizations, ents)
%M{sentence | tokenizations: graph}
end

tokenization =
replace_token_sequences
|> Enum.reduce(raw_tokens, fn seq, toks ->
replace_tokens(toks, seq)
def skip_punct(%M{tokenizations: graph} = m) do
graph =
Enum.reduce(Graph.vertices(graph), graph, fn v, graph ->
connect_punct(graph, v, nil)
end)

tokenizations = both_if_different(no_punct(tokenization), tokenization)
%M{m | tokenizations: Enum.uniq(tokenizations ++ m.tokenizations)}
%{m | tokenizations: graph}
end

defp replace_tokens(token_sequence, replace_tokens) do
# find
start = List.first(replace_tokens).start
end_ = List.last(replace_tokens).end
###

# Walks the outgoing edges of `v` and adds edges that bypass punctuation.
#
# `first` is the vertex from which the current run of punctuation was entered
# (nil when no run is in progress).
#
#   * no successors at all            -> graph is returned untouched
#   * only punctuation successors     -> recurse into each of them, carrying
#                                        `first` (or `v` when `first` is unset)
#   * some non-punctuation successors -> when `first` is set and is itself not
#                                        punctuation, add an edge from `first`
#                                        to each non-punctuation successor
defp connect_punct(graph, v, first) do
  {puncts, others} =
    graph
    |> out_vertices(v)
    |> Enum.split_with(&Token.punct?/1)

  cond do
    puncts == [] and others == [] ->
      graph

    others == [] ->
      anchor = first || v
      Enum.reduce(puncts, graph, fn punct, acc -> connect_punct(acc, punct, anchor) end)

    first && not Token.punct?(first) ->
      Enum.reduce(others, graph, fn target, acc -> Graph.add_edge(acc, first, target) end)

    true ->
      graph
  end
end

start_idx = Enum.find_index(token_sequence, &(&1.start == start))
end_idx = Enum.find_index(token_sequence, &(&1.end == end_))
# Builds an empty directed graph that already contains the two sentinel
# vertices, :start and :end.
defp new_graph() do
  empty = Graph.new(type: :directed)
  Graph.add_vertices(empty, [:start, :end])
end

cond do
start_idx != nil and end_idx != nil and end_idx >= start_idx ->
{a, _} = Enum.split(token_sequence, start_idx)
{_, b} = Enum.split(token_sequence, end_idx + 1)
# Entry point: chains `tokens` into `graph`, beginning at the :start vertex.
defp build_token_graph(graph, tokens),
  do: build_token_graph(graph, tokens, :start)

(a ++ replace_tokens ++ b)
|> reindex()
# No tokens left to add: the graph is returned unchanged.
defp build_token_graph(graph, [], _prev), do: graph

start_idx != nil and end_idx == nil ->
{a, _} = Enum.split(token_sequence, start_idx)
# Final token of a run: link it to its predecessor and then to :end.
defp build_token_graph(graph, [last], prev) do
  with_vertices = Graph.add_vertices(graph, [last, prev, :end])
  linked = Graph.add_edge(with_vertices, prev, last)
  Graph.add_edge(linked, last, :end)
end

(a ++ replace_tokens)
|> reindex()
# Two or more tokens remain: register the head token and its successor, link
# the predecessor to the head, then continue with the head as the new
# predecessor.
defp build_token_graph(graph, [token | [next | _] = remaining], prev) do
  linked =
    graph
    |> Graph.add_vertices([token, next])
    |> Graph.add_edge(prev, token)

  build_token_graph(linked, remaining, token)
end

start_idx == nil and end_idx != nil ->
{_, b} = Enum.split(token_sequence, end_idx + 1)
defp find_start_end(graph, start, end_) do
t_start =
Graph.vertices(graph)
|> Enum.find(&(is_map(&1) && &1.end == start - 1))

(replace_tokens ++ b)
|> reindex()
t_end =
Graph.vertices(graph)
|> Enum.find(&(is_map(&1) && (&1.start == end_ + 1 || &1.start == end_)))

true ->
# raise RuntimeError, "Token not found at start = #{start}, end = #{end_}"
token_sequence
end
{t_start, t_end}
end

defp reindex(tokens) do
tokens
|> Enum.with_index()
|> Enum.map(fn {t, index} ->
%{t | index: index}
# Adds each entity token to the graph as an extra vertex.
#
# For every entity, `find_start_end/3` looks up the neighbouring vertices for
# the entity's start/end offsets. The entity is then linked from the
# preceding vertex (or :start when none was found) and to the following
# vertex (or :end when none was found).
defp add_entities(graph, ents) do
  Enum.reduce(ents, graph, fn %{start: from, end: to} = entity, acc ->
    {preceding, following} = find_start_end(acc, from, to)

    acc
    |> Graph.add_vertex(entity)
    |> Graph.add_edge(preceding || :start, entity)
    |> Graph.add_edge(entity, following || :end)
  end)
end

defp both_if_different(a, b, rest \\ [])
defp both_if_different(a, a, rest), do: [a | rest]
defp both_if_different(a, b, rest), do: [a, b | rest]
@doc """
Returns the vertices reachable from `vertex` over a single outgoing edge.

The result is the `v2` end of every edge returned by `Graph.out_edges/2`.
"""
def out_vertices(graph, vertex) do
  outgoing = Graph.out_edges(graph, vertex)
  Enum.map(outgoing, & &1.v2)
end

defp no_punct(tokens) do
tokens |> Enum.reject(&Token.punct?/1)
@doc """
Renders the token graph of `sentence` as Graphviz DOT source.

The output is a `digraph` with fixed `start` / `end` nodes, one labelled node
per remaining vertex (the label is the vertex's `to_string/1` form) and one
`a -> b` line per edge. Node identifiers come from `vertex_id/1`.

Backslashes and double quotes in a label are escaped, so tokens such as `"`
or `\\` no longer produce invalid DOT.

Returns the DOT document as a binary.
"""
def make_dot(sentence) do
  graph = sentence.tokenizations

  # Escape backslashes first so the backslash added for quotes is not doubled.
  escape = fn vertex ->
    vertex
    |> to_string()
    |> String.replace("\\", "\\\\")
    |> String.replace("\"", "\\\"")
  end

  nodes =
    for v <- Graph.vertices(graph), v != :start, v != :end do
      " #{vertex_id(v)}[label=\"#{escape.(v)}\"]"
    end

  edges =
    for e <- Graph.edges(graph) do
      " #{vertex_id(e.v1)} -> #{vertex_id(e.v2)}"
    end

  [
    "digraph {",
    " start[label=\"START\"]",
    " end[label=\"END\"]",
    nodes,
    edges,
    "}"
  ]
  |> List.flatten()
  |> Enum.join("\n")
end

# Maps a graph vertex to its DOT node identifier: the two sentinel atoms get
# fixed names, any other vertex is named after its `index` field.
defp vertex_id(vertex) do
  case vertex do
    :start -> "start"
    :end -> "end"
    token -> "v#{token.index}"
  end
end
end

defimpl String.Chars, for: BubbleMatch.Sentence do
Expand Down
3 changes: 3 additions & 0 deletions lib/bubble_match/token.ex
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,9 @@ defmodule BubbleMatch.Token do
end

defimpl String.Chars, for: BubbleMatch.Token do
  alias BubbleMatch.Token

  # Entity tokens are rendered as their raw text followed by the entity kind
  # in square brackets.
  def to_string(%Token{type: :entity} = token) do
    "#{token.raw} [#{token.value.kind}]"
  end

  # Every other token is rendered as its raw text.
  def to_string(%Token{} = token), do: token.raw
end

Expand Down
1 change: 1 addition & 0 deletions mix.exs
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ defmodule BubbleMatch.MixProject do
{:nimble_parsec, "~> 0.5.3"},
{:inflex, "~> 2.0"},
{:jason, "~> 1.0"},
{:libgraph, "~> 0.13"},
{:ex_doc, ">= 0.0.0", only: :dev},
{:stream_data, "~> 0.1", only: :test}
]
Expand Down
1 change: 1 addition & 0 deletions mix.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"idna": {:hex, :idna, "6.0.1", "1d038fb2e7668ce41fbf681d2c45902e52b3cb9e9c77b55334353b222c2ee50c", [:rebar3], [{:unicode_util_compat, "0.5.0", [hex: :unicode_util_compat, repo: "hexpm", optional: false]}], "hexpm", "a02c8a1c4fd601215bb0b0324c8a6986749f807ce35f25449ec9e69758708122"},
"inflex": {:hex, :inflex, "2.0.0", "db69d542b8fdb23ac667f9bc0c2395a3983fa2da6ae2efa7ab5dc541928f7a75", [:mix], [], "hexpm", "c018852409bd48b03ad96ed53594186bc074bdd1519043a0ad1fa5697aac4399"},
"jason": {:hex, :jason, "1.2.0", "10043418c42d2493d0ee212d3fddd25d7ffe484380afad769a0a38795938e448", [:mix], [{:decimal, "~> 1.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "116747dbe057794c3a3e4e143b7c8390b29f634e16c78a7f59ba75bfa6852e7f"},
"libgraph": {:hex, :libgraph, "0.13.3", "20732b7bafb933dcf7351c479e03076ebd14a85fd3202c67a1c197f4f7c2466b", [:mix], [], "hexpm", "78f2576eef615440b46f10060b1de1c86640441422832052686df53dc3c148c6"},
"makeup": {:hex, :makeup, "1.0.1", "82f332e461dc6c79dbd82fbe2a9c10d48ed07146f0a478286e590c83c52010b5", [:mix], [{:nimble_parsec, "~> 0.5.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "49736fe5b66a08d8575bf5321d716bac5da20c8e6b97714fec2bcd6febcfa1f8"},
"makeup_elixir": {:hex, :makeup_elixir, "0.14.0", "cf8b7c66ad1cff4c14679698d532f0b5d45a3968ffbcbfd590339cb57742f1ae", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "d4b316c7222a85bbaa2fd7c6e90e37e953257ad196dc229505137c5e505e9eff"},
"match_engine": {:hex, :match_engine, "1.4.5", "d57752c6cc799f5eca89b564a11b3ca0f35253aa4efc9cc35fd1bbb7f35eee42", [:mix], [{:simetric, "~> 0.2.0", [hex: :simetric, repo: "hexpm", optional: false]}, {:timex, "~> 3.1", [hex: :timex, repo: "hexpm", optional: false]}], "hexpm", "95356a6d3bdbf6667f6d2801a97a30dd8a95400b8c5eeb570e3a8e6c86b5373c"},
Expand Down
52 changes: 36 additions & 16 deletions test/bubble_match/sentence_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,39 @@ defmodule BubbleMatch.SentenceTest do

alias BubbleMatch.{Entity, Sentence}

test "tokenize" do
  dot =
    "My birthday, is the day after tomorrow, 10 miles away"
    |> Sentence.naive_tokenize()
    |> Sentence.skip_punct()
    |> Sentence.make_dot()

  # For local debugging: IO.puts(dot)

  # The token chain begins at the start node and proceeds token by token.
  for edge <- ["start -> v0", "v0 -> v1", "v1 -> v2"] do
    assert String.contains?(dot, edge)
  end

  # punct is skipped: a direct v1 -> v3 edge is expected as well
  assert String.contains?(dot, "v1 -> v3")
end

@spacy_json """
{"ents":[{"end":27,"label":"PERSON","start":21}],"sents":[{"end":9,"start":0},{"end":27,"start":10}],"text":"Hi there. My name is George","tokens":[{"dep":"ROOT","end":2,"head":0,"id":0,"lemma":"hi","norm":"hi","pos":"INTJ","start":0,"string":"Hi ","tag":"UH"},{"dep":"advmod","end":8,"head":0,"id":1,"lemma":"there","norm":"there","pos":"ADV","start":3,"string":"there","tag":"RB"},{"dep":"punct","end":9,"head":0,"id":2,"lemma":".","norm":".","pos":"PUNCT","start":8,"string":". ","tag":"."},{"dep":"poss","end":12,"head":4,"id":3,"lemma":"-PRON-","norm":"my","pos":"DET","start":10,"string":"My ","tag":"PRP$"},{"dep":"nsubj","end":17,"head":5,"id":4,"lemma":"name","norm":"name","pos":"NOUN","start":13,"string":"name ","tag":"NN"},{"dep":"ROOT","end":20,"head":5,"id":5,"lemma":"be","norm":"is","pos":"AUX","start":18,"string":"is ","tag":"VBZ"},{"dep":"attr","end":27,"head":5,"id":6,"lemma":"George","norm":"george","pos":"PROPN","start":21,"string":"George","tag":"NNP"}]}
"""
|> Jason.decode!()

test "from_spacy" do
[hithere, mynameis] = Sentence.sentences_from_spacy(@spacy_json)
sentence = Sentence.from_spacy(@spacy_json)

assert [_, [_, _, _]] = hithere.tokenizations
view_graph(sentence)
# System.cmd("dot", ["-Tpng", "/tmp/x.dot"])
# assert [_, [_, _, _]] = hithere.tokenizations

assert [with_ents, raw_tokens] = mynameis.tokenizations
# assert [with_ents, raw_tokens] = mynameis.tokenizations

assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)
# assert ~w(my name is george) == Enum.map(raw_tokens, & &1.value["norm"])
# assert ~w(spacy spacy spacy entity)a == Enum.map(with_ents, & &1.type)

assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
# assert [_, _, _, %{value: %Entity{value: "George"}}] = with_ents
end

test "match from spacy" do
Expand Down Expand Up @@ -56,6 +73,9 @@ defmodule BubbleMatch.SentenceTest do
sentence =
Sentence.naive_tokenize("My birthday is the day after tomorrow, 10 miles away")
|> Sentence.add_duckling_entities(@duckling_json)
|> Sentence.skip_punct()

view_graph(sentence)

assert [with_ents, with_ents_punct | _] = sentence.tokenizations

Expand Down Expand Up @@ -110,21 +130,21 @@ defmodule BubbleMatch.SentenceTest do
|> Jason.decode!()

@time_spacy """
{"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":1},{"start":2,"end":6}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
{"text":"9 p.m.","ents":[],"sents":[{"start":0,"end":9}],"tokens":[{"id":0,"start":0,"end":1,"pos":"NUM","tag":"CD","dep":"ROOT","head":0,"string":"9 ","lemma":"9","norm":"9"},{"id":1,"start":2,"end":6,"pos":"NOUN","tag":"NN","dep":"ROOT","head":1,"string":"p.m.","lemma":"p.m.","norm":"p.m."}]}
"""
|> Jason.decode!()

test "overlapping duckling entities" do
[a, b] = Sentence.sentences_from_spacy(@time_spacy)
Sentence.from_spacy(@time_spacy)
|> Sentence.add_duckling_entities(@time_duckling)
|> Sentence.skip_punct()
|> view_graph()
end

assert [_] = a.tokenizations
a = a |> Sentence.add_duckling_entities(@time_duckling)
assert [with_ents, _raw_tokens] = a.tokenizations
assert List.first(with_ents).value.kind == "time"
defp view_graph(sentence) do
graph = Sentence.make_dot(sentence)

assert [_] = b.tokenizations
b = b |> Sentence.add_duckling_entities(@time_duckling)
assert [with_ents, _raw_tokens] = b.tokenizations
assert List.first(with_ents).value.kind == "time"
File.write!("/tmp/x.dot", graph)
:os.cmd('dot /tmp/x.dot -Tpng > /tmp/x.png; eog /tmp/x.png')
end
end