From c5aa0e05a8f1355daf90c7c5b2b6ddc3d60cdee1 Mon Sep 17 00:00:00 2001 From: John Andrews Date: Wed, 2 Mar 2016 23:34:27 -0500 Subject: [PATCH 1/3] initial cascalog spike --- project.clj | 10 ++-- src/clj/bird_wave/birdlog.clj | 94 +++++++++++++++++++++++++++++++++++ src/clj/bird_wave/import.clj | 1 - 3 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 src/clj/bird_wave/birdlog.clj diff --git a/project.clj b/project.clj index e73fc92..0371f18 100644 --- a/project.clj +++ b/project.clj @@ -3,7 +3,7 @@ :url "http://birdwave.neo.com" :license {:name "Eclipse Public License" :url "http://www.eclipse.org/legal/epl-v10.html"} - :dependencies [[org.clojure/clojure "1.6.0"] + :dependencies [[org.clojure/clojure "1.8.0"] [io.pedestal/pedestal.service "0.2.2"] [io.pedestal/pedestal.service-tools "0.2.2"] [com.datomic/datomic-pro "0.9.4815.12" @@ -16,7 +16,9 @@ [com.cognitect/transit-cljs "0.8.188"] [org.omcljs/om "0.8.7" :as om] [cljsjs/d3 "3.5.3-0"] - [com.cemerick/url "0.1.1"]] + [com.cemerick/url "0.1.1"] + [cascalog/cascalog-core "3.0.0"] + [cascalog/cascalog-more-taps "3.0.0"]] :plugins [[lein-cljsbuild "1.0.3"] [datomic-schema-grapher "0.0.1"] [ohpauleez/lein-pedestal "0.1.0-beta10"]] @@ -77,7 +79,9 @@ :profiles {:dev {:dependencies [[io.pedestal/pedestal.jetty "0.2.2"] [datomic-schema-grapher "0.0.1"] [ankha "0.1.1"] - [org.clojure/tools.namespace "0.2.4"]] + [org.clojure/tools.namespace "0.2.4"] + [org.apache.hadoop/hadoop-core "1.2.1"]] + :jvm-opts ["-Xms768m" "-Xmx768m"] :source-paths ["dev/clj"]}} diff --git a/src/clj/bird_wave/birdlog.clj b/src/clj/bird_wave/birdlog.clj new file mode 100644 index 0000000..cbcad39 --- /dev/null +++ b/src/clj/bird_wave/birdlog.clj @@ -0,0 +1,94 @@ +(ns bird-wave.birdlog + (:require [cascalog.api :refer :all] + [cascalog.more-taps :as taps] + [clojure.string :as cs] + [clojure.instant :as inst])) + +(def field-positions + [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" + :taxon/order ;; "TAXONOMIC ORDER" + nil ;; "CATEGORY" + :taxon/common-name ;; "COMMON NAME" + :taxon/scientific-name ;; "SCIENTIFIC NAME" + :taxon/subspecies-common-name ;; "SUBSPECIES COMMON NAME" + :taxon/subspecies-scientific-name;; "SUBSPECIES SCIENTIFIC NAME" + :sighting/count ;; "OBSERVATION COUNT" ;; x indicates uncounted + nil ;; "BREEDING BIRD ATLAS CODE" + nil ;; "AGE/SEX" + nil ;; "COUNTRY" + nil ;; "COUNTRY_CODE" + :sighting/state ;; "STATE_PROVINCE" + :sighting/state-code ;; "SUBNATIONAL1_CODE" + :sighting/county ;; "COUNTY" + :sighting/county-code ;; "SUBNATIONAL2_CODE" + nil ;; "IBA CODE" + :sighting/locality ;; "LOCALITY" + nil ;; "LOCALITY ID" + nil ;; "LOCALITY TYPE" + :sighting/latitude ;; "LATITUDE" + :sighting/longitude ;; "LONGITUDE" + :sighting/date ;; "OBSERVATION DATE" + nil ;; "TIME OBSERVATIONS STARTED" + nil ;; "TRIP COMMENTS" + nil ;; "SPECIES COMMENTS" + nil ;; "OBSERVER ID" + nil ;; "FIRST NAME" + nil ;; "LAST NAME" + nil ;; "SAMPLING EVENT IDENTIFIER" + nil ;; "PROTOCOL TYPE" + nil ;; "PROJECT CODE" + nil ;; "DURATION MINUTES" + nil ;; "EFFORT DISTANCE KM" + nil ;; "EFFORT AREA HA" + nil ;; "NUMBER OBSERVERS" + nil ;; "ALL SPECIES REPORTED" + nil ;; "GROUP IDENTIFIER" + nil ;; "APPROVED" + nil ;; "REVIEWED" + nil ;; "REASON" + ]) + +(def fields (remove nil? fields)) + +(defn sighting + "given a line of text, split on tabs and return the fields we care about + (indicated by non-nil presence in fields vector)" + [plaintext-row] + (into {} + (remove nil? + (map #(if % [% %2]) + field-positions + (cs/split plaintext-row #"\t"))))) + +(defn coerce [m f & [key & keys]] + (if key + (recur (assoc m key (f (get m key))) f keys) + m)) + +(defn parse-count [s] + (if (= "X" s) 1 (Long/parseLong s))) + +(defn coerce-values [sighting] + (let [date (inst/read-instant-date (:sighting/date sighting)) ] + (-> sighting + (coerce bigdec + :sighting/latitude + :sighting/longitude) + (coerce parse-count + :sighting/count) + (assoc :sighting/date date) + (assoc :sighting/month-yr (format "%tY/%tm" date date)) + ))) + +(defn ordered-values + "return values of fields in column order" + [sighting] + (map sighting fields)) + +(defmapop parse-line [line] + (ordered-values (coerce-values (sighting line)))) + +(?- (stdout) + (<- [?common-name ?count] + ((hfs-textline "sample_data/birds.txt") ?line) + (parse-line ?line :> ?guid ?order ?common-name ?scientific-name ?subspecies-common-name ?subspecies-scientific-name ?count ?state ?state-code ?county ?county-code ?locality ?latitude ?longitude ?date))) diff --git a/src/clj/bird_wave/import.clj b/src/clj/bird_wave/import.clj index 927bf2e..b98dda2 100644 --- a/src/clj/bird_wave/import.clj +++ b/src/clj/bird_wave/import.clj @@ -5,7 +5,6 @@ [datomic.api :as d])) (def fields - ;; TODO: consider including guid as a primary key for sighting [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" :taxon/order ;; "TAXONOMIC ORDER" nil ;; "CATEGORY" From 964ac618ca03f508473b39cb4eaebe393fa04a78 Mon Sep 17 00:00:00 2001 From: John Andrews Date: Wed, 2 Mar 2016 23:48:37 -0500 Subject: [PATCH 2/3] Tweak to remove header row (fix me) --- sample_data/birds.txt | 1 - src/clj/bird_wave/birdlog.clj | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/sample_data/birds.txt b/sample_data/birds.txt index 89282ec..91f18c3 100644 --- a/sample_data/birds.txt +++ b/sample_data/birds.txt @@ -1,4 +1,3 @@ -˙GLOBAL UNIQUE IDENTIFIER TAXONOMIC ORDER CATEGORY COMMON NAME SCIENTIFIC NAME SUBSPECIES COMMON NAME SUBSPECIES SCIENTIFIC NAME OBSERVATION COUNT BREEDING BIRD ATLAS CODE AGE/SEX COUNTRY COUNTRY_CODE STATE_PROVINCE SUBNATIONAL1_CODE COUNTY SUBNATIONAL2_CODE IBA CODE LOCALITY LOCALITY ID LOCALITY TYPE LATITUDE LONGITUDE OBSERVATION DATE TIME OBSERVATIONS STARTED TRIP COMMENTS SPECIES COMMENTS OBSERVER ID FIRST NAME LAST NAME SAMPLING EVENT IDENTIFIER PROTOCOL TYPE PROJECT CODE DURATION MINUTES EFFORT DISTANCE KM EFFORT AREA HA NUMBER OBSERVERS ALL SPECIES REPORTED GROUP IDENTIFIER APPROVED REVIEWED REASON URN:CornellLabOfOrnithology:EBIRD:OBS175897027 21475 species American Dipper Cinclus mexicanus 4 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 0 URN:CornellLabOfOrnithology:EBIRD:OBS175897003 29709 species American Tree Sparrow Spizella arborea 1 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. A casual/accidental species in Unalaska. There are three individuals in the area now ( first week January 2013). Two showed up in Captain's Bay on December 15th, 2012 ( relocated for the CBC) and a third has been frequenting a local feeder. I have photos of that bird. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 1 URN:CornellLabOfOrnithology:EBIRD:OBS175897012 2881 species Bald Eagle Haliaeetus leucocephalus 90 United States US Alaska US-AK Aleutians West US-AK-016 US-AK_4441 Captain's Bay L1875942 P 53.8412816 -166.5905365 2012-12-29 23:04:00 These are the birds counted while covering my section (Captain's Bay) of the Unalaska Island Count circle for the 113th CBC. I covered four miles by road and 1.5 miles (3 roundtrip) by foot. Temps 35 to 38 F, drizzle turning to heavy rain as the day went on, overcast with light NE winds, fresh water partly frozen. 6"to 12" snow depth. obs280983 Suzi Golodoff S12544775 eBird - Traveling Count EBIRD 300 8.851 1 1 1 0 diff --git a/src/clj/bird_wave/birdlog.clj b/src/clj/bird_wave/birdlog.clj index cbcad39..52d9174 100644 --- a/src/clj/bird_wave/birdlog.clj +++ b/src/clj/bird_wave/birdlog.clj @@ -48,7 +48,7 @@ nil ;; "REASON" ]) -(def fields (remove nil? fields)) +(def fields (remove nil? field-positions)) (defn sighting "given a line of text, split on tabs and return the fields we care about @@ -81,7 +81,7 @@ ))) (defn ordered-values - "return values of fields in column order" + "return sighting values in the order they appear in field-positions" [sighting] (map sighting fields)) From 954995665abe1b2a0617d349b379a39d4f34e9c5 Mon Sep 17 00:00:00 2001 From: John Andrews Date: Sat, 5 Mar 2016 18:49:08 -0500 Subject: [PATCH 3/3] Spike out similar ideas in PigPen --- project.clj | 4 +- src/clj/bird_wave/birdlog.clj | 25 +++++-- src/clj/bird_wave/pigpen.clj | 131 ++++++++++++++++++++++++++++++++++ 3 files changed, 154 insertions(+), 6 deletions(-) create mode 100644 src/clj/bird_wave/pigpen.clj diff --git a/project.clj b/project.clj index 0371f18..d2497ab 100644 --- a/project.clj +++ b/project.clj @@ -18,7 +18,8 @@ [cljsjs/d3 "3.5.3-0"] [com.cemerick/url "0.1.1"] [cascalog/cascalog-core "3.0.0"] - [cascalog/cascalog-more-taps "3.0.0"]] + [cascalog/cascalog-more-taps "3.0.0"] + [com.netflix.pigpen/pigpen-pig "0.3.2"]] :plugins [[lein-cljsbuild "1.0.3"] [datomic-schema-grapher "0.0.1"] [ohpauleez/lein-pedestal "0.1.0-beta10"]] @@ -80,6 +81,7 @@ [datomic-schema-grapher "0.0.1"] [ankha "0.1.1"] [org.clojure/tools.namespace "0.2.4"] + [org.apache.pig/pig "0.13.0"] [org.apache.hadoop/hadoop-core "1.2.1"]] :jvm-opts ["-Xms768m" "-Xmx768m"] :source-paths ["dev/clj"]}} diff --git a/src/clj/bird_wave/birdlog.clj b/src/clj/bird_wave/birdlog.clj index 52d9174..3ca6afa 100644 --- a/src/clj/bird_wave/birdlog.clj +++ b/src/clj/bird_wave/birdlog.clj @@ -1,5 +1,6 @@ (ns bird-wave.birdlog (:require [cascalog.api :refer :all] + [cascalog.logic.ops :as c] [cascalog.more-taps :as taps] [clojure.string :as cs] [clojure.instant :as inst])) @@ -48,7 +49,8 @@ nil ;; "REASON" ]) -(def fields (remove nil? field-positions)) +(def fields (concat (remove nil? field-positions) + [:sighting/month-yr])) (defn sighting "given a line of text, split on tabs and return the fields we care about @@ -88,7 +90,20 @@ (defmapop parse-line [line] (ordered-values (coerce-values (sighting line)))) -(?- (stdout) - (<- [?common-name ?count] - ((hfs-textline "sample_data/birds.txt") ?line) - (parse-line ?line :> ?guid ?order ?common-name ?scientific-name ?subspecies-common-name ?subspecies-scientific-name ?count ?state ?state-code ?county ?county-code ?locality ?latitude ?longitude ?date))) +(comment + (taps/hfs-delimited "sample_data/out/" + :delimiter "," + :classes false + :write-header? true + :quote true + :sink-template "%s/%s" + :templatefields ["order" "month-yr"]) + + + (?- (hfs-textline "sample_data/out/" :sink-template "%s/%s" :templatefields ["?order" "?month-yr"]) + (<- [?order ?state ?county ?month-yr ?total] + ((hfs-textline "sample_data/birds.txt") ?line) + (parse-line ?line :> ?guid ?order ?common-name ?scientific-name ?subspecies-common-name ?subspecies-scientific-name ?count ?state ?state-code ?county ?county-code ?locality ?latitude ?longitude ?date ?month-yr) + (c/sum ?count :> ?total))) + + ) diff --git a/src/clj/bird_wave/pigpen.clj b/src/clj/bird_wave/pigpen.clj new file mode 100644 index 0000000..3736812 --- /dev/null +++ b/src/clj/bird_wave/pigpen.clj @@ -0,0 +1,131 @@ +(ns bird-wave.pigpen + (:require [pigpen.core :as pig] + [pigpen.fold :as fold] + [clojure.string :as cs] + [clojure.instant :as inst])) + +(defn birds-file + [filename] + (pig/load-tsv filename)) + +(def field-positions + [:sighting/guid ;; "GLOBAL UNIQUE IDENTIFIER" + :taxon/order ;; "TAXONOMIC ORDER" + nil ;; "CATEGORY" + :taxon/common-name ;; "COMMON NAME" + :taxon/scientific-name ;; "SCIENTIFIC NAME" + :taxon/subspecies-common-name ;; "SUBSPECIES COMMON NAME" + :taxon/subspecies-scientific-name;; "SUBSPECIES SCIENTIFIC NAME" + :sighting/count ;; "OBSERVATION COUNT" ;; x indicates uncounted + nil ;; "BREEDING BIRD ATLAS CODE" + nil ;; "AGE/SEX" + nil ;; "COUNTRY" + nil ;; "COUNTRY_CODE" + :sighting/state ;; "STATE_PROVINCE" + :sighting/state-code ;; "SUBNATIONAL1_CODE" + :sighting/county ;; "COUNTY" + :sighting/county-code ;; "SUBNATIONAL2_CODE" + nil ;; "IBA CODE" + :sighting/locality ;; "LOCALITY" + nil ;; "LOCALITY ID" + nil ;; "LOCALITY TYPE" + :sighting/latitude ;; "LATITUDE" + :sighting/longitude ;; "LONGITUDE" + :sighting/date ;; "OBSERVATION DATE" + nil ;; "TIME OBSERVATIONS STARTED" + nil ;; "TRIP COMMENTS" + nil ;; "SPECIES COMMENTS" + nil ;; "OBSERVER ID" + nil ;; "FIRST NAME" + nil ;; "LAST NAME" + nil ;; "SAMPLING EVENT IDENTIFIER" + nil ;; "PROTOCOL TYPE" + nil ;; "PROJECT CODE" + nil ;; "DURATION MINUTES" + nil ;; "EFFORT DISTANCE KM" + nil ;; "EFFORT AREA HA" + nil ;; "NUMBER OBSERVERS" + nil ;; "ALL SPECIES REPORTED" + nil ;; "GROUP IDENTIFIER" + nil ;; "APPROVED" + nil ;; "REVIEWED" + nil ;; "REASON" + ]) + +(def fields (concat (remove nil? field-positions) + [:sighting/month-yr])) + +(defn sighting + "given a line of text, split on tabs and return the fields we care about + (indicated by non-nil presence in fields vector)" + [row] + (->> row + (map #(if % [% %2]) field-positions) + (remove nil?) + (into {}))) + +(defn coerce [m f & [key & keys]] + (if key + (recur (assoc m key (f (get m key))) f keys) + m)) + +(defn parse-count [s] + (if (= "X" s) 1 (Long/parseLong s))) + +(defn coerce-values [sighting] + (let [date (inst/read-instant-date (:sighting/date sighting)) ] + (-> sighting + (coerce bigdec + :sighting/latitude + :sighting/longitude) + (coerce parse-count + :sighting/count) + (assoc :sighting/date date) + (assoc :sighting/month-yr (format "%tY/%tm" date date)) + ))) + +(defn birds-by-us-county + [data] + (->> data + (pig/map + (fn [row] + (-> row + (sighting) + (coerce-values)))) + (pig/group-by (juxt :taxon/order :sighting/month-yr :sighting/state :sighting/county) + {:fold (fold/juxt (fold/count) + (->> (fold/map :sighting/count) (fold/sum)) + (->> (fold/map :sighting/count) (fold/avg)))}) + (pig/map flatten))) + +(defn species + [data] + (->> data + (pig/map + (fn [row] + (-> row + (sighting) + (coerce-values)))) + (pig/group-by (juxt :taxon/order :taxon/common-name) + {:fold (fold/count)}) + (pig/map flatten) + ;; (pig/filter (fn [x] (< 1 (last x)))) + )) + +(defn transform-birds + [input-file output-file] + (->> (birds-file input-file) + (birds-by-us-county) + (pig/store-clj output-file))) + +(comment + + (pig/dump (->> "sample_data/birds.txt" + (birds-file) + (birds-by-us-county))) + + (pig/dump (->> "sample_data/birds.txt" + (birds-file) + (species))) + + )