diff --git a/.gitignore b/.gitignore index b3d43a6f7..ec317c828 100644 --- a/.gitignore +++ b/.gitignore @@ -33,3 +33,4 @@ coverage/ /lc_browseview_cache.json /lc_cache.json /linkedcat.sqlite +.env \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c5187ce42..39fe13302 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,126 +2,183 @@ version: '3.7' services: - pgsql: + db: image: 'postgres:12.2-alpine' + hostname: "${POSTGRES_HOSTNAME}" restart: always environment: POSTGRES_USER: "${POSTGRES_USER}" POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}" + command: postgres -c config_file=/etc/postgresql.conf -c hba_file=/etc/pg_hba.conf volumes: - - db-data:/var/lib/postgresql/data - - ./server/workers/pg_hba.conf:/var/lib/postgresql/data/pg_hba.conf - - ./server/workers/postgresql.conf:/var/lib/postgresql/data/postgresql.conf - ports: - - '127.0.0.1:5432:5432' + # - ~/data/OKMaps/${COMPOSE_PROJECT_NAME}/postgresql/data:/var/lib/postgresql/data + - db_data:/var/lib/postgresql/data + - ./server/workers/pg_hba.conf:/etc/pg_hba.conf + - ./server/workers/postgresql.conf:/etc/postgresql.conf + networks: + - headstart pgadmin: image: 'dpage/pgadmin4' ports: - '127.0.0.1:54323:80' - env_file: - - .env - - api: - build: - context: server - dockerfile: services.docker - restart: always - depends_on: - - redis - network_mode: "host" - depends_on: - - redis + networks: + - headstart redis: image: 'redis:4.0-alpine' restart: always - command: ["redis-server", "/etc/redis/redis.conf", "--appendonly", "yes"] + hostname: "${REDIS_HOST}" + environment: + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + command: ["redis-server", "/etc/redis/redis.conf", "--bind", "${REDIS_HOST}", "--appendonly", "yes", "--port", "${REDIS_PORT}"] volumes: - - 'redis:/var/lib/redis/data' - - ./server/workers/redis.conf:/etc/redis/redis.conf + - 'redis:/var/lib/redis/data' + - ./server/workers/redis.conf:/etc/redis/redis.conf + networks: + - headstart + + api: + image: api:${SERVICE_VERSION} restart: always - network_mode: "host" + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" + REDIS_DB: "${REDIS_DB}" + command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"] + depends_on: + - redis + networks: + - headstart - search_triple: - build: - context: server - dockerfile: search_triple.docker + persistence: + image: persistence:${SERVICE_VERSION} + restart: always + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + POSTGRES_USER: "${POSTGRES_USER}" + POSTGRES_PASSWORD: "${POSTGRES_PASSWORD}" + POSTGRES_HOST: "${POSTGRES_HOST}" + POSTGRES_PORT: "${POSTGRES_PORT}" + command: ["gunicorn", "--workers", "10", "--threads", "2", "-b", "0.0.0.0:${API_PORT}", "app:app", "--timeout", "300"] + networks: + - headstart + + triple: + image: triple:${SERVICE_VERSION} env_file: - server/workers/triple/triple.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" depends_on: - redis + networks: + - headstart - search_gsheets: - build: - context: server - dockerfile: search_gsheets.docker + gsheets: + image: gsheets:${SERVICE_VERSION} env_file: - server/workers/gsheets/gsheets.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: 
"${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" depends_on: - redis + networks: + - headstart dataprocessing: - build: - context: server - dockerfile: dataprocessing.docker + image: dataprocessing:${SERVICE_VERSION} env_file: - - server/workers/dataprocessing/dataprocessing.env + - server/workers/dataprocessing/dataprocessing.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" volumes: - /opt/local/renv/cache:/renv/cache - /var/log/headstart:/var/log/headstart depends_on: - redis + networks: + - headstart - search_base: - build: - context: server - dockerfile: base.docker + base: + image: base:${SERVICE_VERSION} env_file: - server/workers/base/base.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" volumes: - /opt/local/renv/cache:/renv/cache - /var/log/headstart:/var/log/headstart depends_on: - redis + networks: + - headstart - search_pubmed: - build: - context: server - dockerfile: pubmed.docker + pubmed: + image: pubmed:${SERVICE_VERSION} env_file: - server/workers/pubmed/pubmed.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" volumes: - /opt/local/renv/cache:/renv/cache - /var/log/headstart:/var/log/headstart depends_on: - redis + networks: + - headstart - search_openaire: - build: - context: server - dockerfile: openaire.docker + openaire: + image: openaire:${SERVICE_VERSION} env_file: - server/workers/openaire/openaire.env + environment: + SERVICE_VERSION: "${SERVICE_VERSION}" + REDIS_HOST: "${REDIS_HOST}" + REDIS_PORT: "${REDIS_PORT}" + REDIS_DB: "${REDIS_DB}" + REDIS_PASSWORD: "${REDIS_PASSWORD}" restart: always - network_mode: "host" volumes: - /opt/local/renv/cache:/renv/cache - /var/log/headstart:/var/log/headstart depends_on: - redis + networks: + - headstart volumes: redis: - db-data: + db_data: driver: local + +networks: + headstart: \ No newline at end of file diff --git a/docker-compose_win.yml b/docker-compose_win.yml deleted file mode 100644 index 8cd7f6e45..000000000 --- a/docker-compose_win.yml +++ /dev/null @@ -1,42 +0,0 @@ -version: '3.7' - -services: - - api: - build: - context: server - dockerfile: services.docker - restart: always - ports: - - '127.0.0.1:5001:5001' - depends_on: - - redis - - redis: - image: 'redis:4.0-alpine' - restart: always - command: ["redis-server", "/etc/redis/redis.conf", "--appendonly", "yes"] - volumes: - - ./server/workers/redis.conf:/etc/redis/redis.conf - restart: always - ports: - - '127.0.0.1:6379:6379' - - search_triple: - build: - context: server - dockerfile: search_triple.docker - env_file: - - server/workers/triple/triple.env - restart: always - - dataprocessing: - build: - context: server - dockerfile: dataprocessing.docker - env_file: - - server/workers/dataprocessing/dataprocessing.env - restart: always - -volumes: - redis: diff --git a/example.env b/example.env deleted file mode 100644 index ee2241b1d..000000000 --- a/example.env +++ /dev/null @@ -1,6 +0,0 @@ -POSTGRES_DB=postgres -POSTGRES_USER=headstart 
-POSTGRES_PASSWORD=password -PGADMIN_DEFAULT_EMAIL=email@domain.org -PGADMIN_DEFAULT_PASSWORD=password -PGADMIN_VOLUME=/path/to/pgadminworkingdir diff --git a/server/classes/headstart/library/APIClient.php b/server/classes/headstart/library/APIClient.php new file mode 100644 index 000000000..01fd6ddcd --- /dev/null +++ b/server/classes/headstart/library/APIClient.php @@ -0,0 +1,49 @@ +<?php + +namespace headstart\library; + +class APIClient { + + public function __construct($ini_array) { + $this->load_configs($ini_array); + } + + public function load_configs($ini_array) { + $this->ini_array = $ini_array; + $this->settings = $this->ini_array["general"]; + $this->processing_backend = isset($this->ini_array["general"]["processing_backend"]) + ? ($this->ini_array["general"]["processing_backend"]) + : "legacy"; + $this->persistence_backend = isset($this->ini_array["general"]["persistence_backend"]) + ? ($this->ini_array["general"]["persistence_backend"]) + : "legacy"; + $this->database = $this->ini_array["connection"]["database"]; + $this->WORKING_DIR = $this->ini_array["general"]["preprocessing_dir"] . $this->ini_array["output"]["output_dir"]; + $api_url = $this->ini_array["general"]["api_url"]; + $api_flavor = isset($this->ini_array["general"]["api_flavor"]) + ? ($this->ini_array["general"]["api_flavor"]) + : "stable"; + $this->base_route = $api_url . $api_flavor . "/"; + } + + public function call_api($endpoint, $payload) { + $route = $this->base_route . $endpoint; + $res = CommUtils::call_api($route, $payload); + if ($res["httpcode"] != 200) { + $res["route"] = $route; + } + return $res; + } + + public function call_persistence($endpoint, $payload) { + $route = $this->base_route . "persistence/" . $endpoint . "/" . $this->database; + $res = CommUtils::call_api($route, $payload); + if ($res["httpcode"] != 200) { + $res["route"] = $route; + } + return $res; + } + +} \ No newline at end of file diff --git a/server/classes/headstart/library/Inflector.php b/server/classes/headstart/library/Inflector.php deleted file mode 100644 index b2f3c0915..000000000 --- a/server/classes/headstart/library/Inflector.php +++ /dev/null @@ -1,374 +0,0 @@ - '\1zes', - '/^(ox)$/i' => '\1en', - '/([m|l])ouse$/i' => '\1ice', - '/(matr|vert|ind)ix|ex$/i' => '\1ices', - '/(x|ch|ss|sh)$/i' => '\1es', - '/([^aeiouy]|qu)ies$/i' => '\1y', - '/([^aeiouy]|qu)y$/i' => '\1ies', - '/(hive)$/i' => '\1s', - '/(?:([^f])fe|([lr])f)$/i' => '\1\2ves', - '/sis$/i' => 'ses', - '/([ti])um$/i' => '\1a', - '/(buffal|tomat)o$/i' => '\1oes', - '/(bu)s$/i' => '\1ses', - '/(alias|status)/i'=> '\1es', - '/(octop|vir)us$/i'=> '\1i', - '/(ax|test)is$/i'=> '\1es', - '/s$/i'=> 's', - '/$/'=> 's'); - - $uncountable = array('equipment', 'information', 'rice', 'money', 'species', 'series', 'fish', 'sheep'); - - $irregular = array( - 'person' => 'people', - 'man' => 'men', - 'child' => 'children', - 'sex' => 'sexes', - 'move' => 'moves'); - - $lowercased_word = strtolower($word); - - foreach ($uncountable as $_uncountable){ - if(substr($lowercased_word,(-1*strlen($_uncountable))) == $_uncountable){ - return $word; - } - } - - foreach ($irregular as $_plural=> $_singular){ - if (preg_match('/('.$_plural.')$/i', $word, $arr)) { - return preg_replace('/('.$_plural.')$/i', substr($arr[0],0,1).substr($_singular,1), $word); - } - } - - foreach ($plural as $rule => $replacement) { - if (preg_match($rule, $word)) { - return preg_replace($rule, $replacement, $word); - } - } - return false; - - } - - // }}} - // {{{ singularize() - - /** - * Singularizes English nouns. 
- * - * @access public - * @static - * @param string $word English noun to singularize - * @return string Singular noun. - */ - static function singularize($word) - { - $singular = array ( - '/(quiz)zes$/i' => '\1', - '/(matr)ices$/i' => '\1ix', - '/(vert|ind)ices$/i' => '\1ex', - '/^(ox)en/i' => '\1', - '/(alias|status)es$/i' => '\1', - '/([octop|vir])i$/i' => '\1us', - '/(cris|ax|test)es$/i' => '\1is', - '/(shoe)s$/i' => '\1', - '/(o)es$/i' => '\1', - '/(bus)es$/i' => '\1', - '/([m|l])ice$/i' => '\1ouse', - '/(x|ch|ss|sh)es$/i' => '\1', - '/(m)ovies$/i' => '\1ovie', - '/(s)eries$/i' => '\1eries', - '/([^aeiouy]|qu)ies$/i' => '\1y', - '/([lr])ves$/i' => '\1f', - '/(tive)s$/i' => '\1', - '/(hive)s$/i' => '\1', - '/([^f])ves$/i' => '\1fe', - '/(^analy)ses$/i' => '\1sis', - '/((a)naly|(b)a|(d)iagno|(p)arenthe|(p)rogno|(s)ynop|(t)he)ses$/i' => '\1\2sis', - '/([ti])a$/i' => '\1um', - '/(n)ews$/i' => '\1ews', - '/s$/i' => '', - ); - - $uncountable = array('equipment', 'information', 'rice', 'money', 'species', 'series', 'fish', 'sheep'); - - $irregular = array( - 'person' => 'people', - 'man' => 'men', - 'child' => 'children', - 'sex' => 'sexes', - 'move' => 'moves'); - - $lowercased_word = strtolower($word); - foreach ($uncountable as $_uncountable){ - if(substr($lowercased_word,(-1*strlen($_uncountable))) == $_uncountable){ - return $word; - } - } - - foreach ($irregular as $_plural=> $_singular){ - if (preg_match('/('.$_singular.')$/i', $word, $arr)) { - return preg_replace('/('.$_singular.')$/i', substr($arr[0],0,1).substr($_plural,1), $word); - } - } - - foreach ($singular as $rule => $replacement) { - if (preg_match($rule, $word)) { - return preg_replace($rule, $replacement, $word); - } - } - - return $word; - } - - // }}} - // {{{ titleize() - - /** - * Converts an underscored or CamelCase word into a English - * sentence. - * - * The titleize function converts text like "WelcomePage", - * "welcome_page" or "welcome page" to this "Welcome - * Page". - * If second parameter is set to 'first' it will only - * capitalize the first character of the title. - * - * @access public - * @static - * @param string $word Word to format as tile - * @param string $uppercase If set to 'first' it will only uppercase the - * first character. Otherwise it will uppercase all - * the words in the title. - * @return string Text formatted as title - */ - function titleize($word, $uppercase = '') - { - $uppercase = $uppercase == 'first' ? 'ucfirst' : 'ucwords'; - return $uppercase(Inflector::humanize(Inflector::underscore($word))); - } - - // }}} - // {{{ camelize() - - /** - * Returns given word as CamelCased - * - * Converts a word like "send_email" to "SendEmail". It - * will remove non alphanumeric character from the word, so - * "who's online" will be converted to "WhoSOnline" - * - * @access public - * @static - * @see variablize - * @param string $word Word to convert to camel case - * @return string UpperCamelCasedWord - */ - function camelize($word) - { - return str_replace(' ','',ucwords(preg_replace('/[^A-Z^a-z^0-9]+/',' ',$word))); - } - - // }}} - // {{{ underscore() - - /** - * Converts a word "into_it_s_underscored_version" - * - * Convert any "CamelCased" or "ordinary Word" into an - * "underscored_word". - * - * This can be really useful for creating friendly URLs. 
- * - * @access public - * @static - * @param string $word Word to underscore - * @return string Underscored word - */ - function underscore($word) - { - return strtolower(preg_replace('/[^A-Z^a-z^0-9]+/','_', - preg_replace('/([a-zd])([A-Z])/','1_2', - preg_replace('/([A-Z]+)([A-Z][a-z])/','1_2',$word)))); - } - - // }}} - // {{{ humanize() - - /** - * Returns a human-readable string from $word - * - * Returns a human-readable string from $word, by replacing - * underscores with a space, and by upper-casing the initial - * character by default. - * - * If you need to uppercase all the words you just have to - * pass 'all' as a second parameter. - * - * @access public - * @static - * @param string $word String to "humanize" - * @param string $uppercase If set to 'all' it will uppercase all the words - * instead of just the first one. - * @return string Human-readable word - */ - function humanize($word, $uppercase = '') - { - $uppercase = $uppercase == 'all' ? 'ucwords' : 'ucfirst'; - return $uppercase(str_replace('_',' ',preg_replace('/_id$/', '',$word))); - } - - // }}} - // {{{ variablize() - - /** - * Same as camelize but first char is underscored - * - * Converts a word like "send_email" to "sendEmail". It - * will remove non alphanumeric character from the word, so - * "who's online" will be converted to "whoSOnline" - * - * @access public - * @static - * @see camelize - * @param string $word Word to lowerCamelCase - * @return string Returns a lowerCamelCasedWord - */ - function variablize($word) - { - $word = Inflector::camelize($word); - return strtolower($word[0]).substr($word,1); - } - - // }}} - // {{{ tableize() - - /** - * Converts a class name to its table name according to rails - * naming conventions. - * - * Converts "Person" to "people" - * - * @access public - * @static - * @see classify - * @param string $class_name Class name for getting related table_name. - * @return string plural_table_name - */ - function tableize($class_name) - { - return Inflector::pluralize(Inflector::underscore($class_name)); - } - - // }}} - // {{{ classify() - - /** - * Converts a table name to its class name according to rails - * naming conventions. - * - * Converts "people" to "Person" - * - * @access public - * @static - * @see tableize - * @param string $table_name Table name for getting related ClassName. - * @return string SingularClassName - */ - function classify($table_name) - { - return Inflector::camelize(Inflector::singularize($table_name)); - } - - // }}} - // {{{ ordinalize() - - /** - * Converts number to its ordinal English form. - * - * This method converts 13 to 13th, 2 to 2nd ... - * - * @access public - * @static - * @param integer $number Number to get its ordinal value - * @return string Ordinal representation of given string. 
- */ - function ordinalize($number) - { - if (in_array(($number % 100),range(11,13))){ - return $number.'th'; - }else{ - switch (($number % 10)) { - case 1: - return $number.'st'; - break; - case 2: - return $number.'nd'; - break; - case 3: - return $number.'rd'; - default: - return $number.'th'; - break; - } - } - } - - // }}} - -} - -?> diff --git a/server/classes/headstart/library/toolkit.php b/server/classes/headstart/library/toolkit.php index 81dadccc6..3482798df 100644 --- a/server/classes/headstart/library/toolkit.php +++ b/server/classes/headstart/library/toolkit.php @@ -2,8 +2,6 @@ namespace headstart\library; -include "Inflector.php"; - class Toolkit { private static $initialized = false; @@ -137,19 +135,6 @@ public static function loadIni($path) { return $ini_array; } - public static function normalizeString($old_string) { - self::initialize(); - - $new_string = preg_replace('/[^a-zA-Z0-9\s]/', '', strtolower($old_string)); - $new_string_array = preg_split('/ /', $new_string); - $final_string = ""; - foreach ($new_string_array as $string) { - $final_string .= \Inflector::singularize($string) . " "; - } - - return trim($final_string); - } - public static function isJson($string) { self::initialize(); diff --git a/server/classes/headstart/personalization/DBConnectionPersonalization.php b/server/classes/headstart/personalization/DBConnectionPersonalization.php deleted file mode 100644 index 5ecc90cff..000000000 --- a/server/classes/headstart/personalization/DBConnectionPersonalization.php +++ /dev/null @@ -1,119 +0,0 @@ -db); - - $data = array(); - - - while($row = mysql_fetch_assoc($result)) { - $data[] = $row; - } - - return $data; - - } - - public function getPersonalRecommendations($user_id, $event_id, $max_recommendations) { - - $query = sprintf("SELECT DISTINCT content.contentID - FROM predictedscore, content - WHERE (predictedscore.contentID=content.contentID) - AND (predictedscore.userID = %d)", mysql_real_escape_string($user_id)); - - $query .= $this->createConferenceIDString($event_id); - - $query .= sprintf("order by score desc limit 0, %d", mysql_real_escape_string($max_recommendations)); - - $result = mysql_query($query, $this->db); - - $data = array(); - - if($result) { - while($row = mysql_fetch_assoc($result)) { - $data[] = $row; - } - } - - return $data; - - } - - public function addPersonalBookmark($user_id, $content_id) { - - $query = sprintf("INSERT INTO bookmarking(contentID, userID, created) - VALUES(%d, %d, now())", mysql_real_escape_string($content_id), mysql_real_escape_string($user_id)); - - - $result = mysql_query($query, $this->db); - - return ($result != false)?(true):(false); - - } - - public function removePersonalBookmark($user_id, $content_id) { - - $query = sprintf("DELETE FROM bookmarking - WHERE contentID=%d AND userID=%d", mysql_real_escape_string($content_id), mysql_real_escape_string($user_id)); - - - $result = mysql_query($query, $this->db); - - return ($result != false)?(true):(false); - - } - - public function getConferenceBookmarks($conference_id) { - - $query = sprintf("SELECT bookmarking.bookmarkingID, bookmarking.userID, bookmarking.contentID - FROM bookmarking, content - WHERE (bookmarking.contentID=content.contentID) AND (content.contentType <> %s)" - ,"\"no-paper\""); - - $query .= $this->createConferenceIDString($conference_id); - - $result = mysql_query($query); - - if (!$result) { - $message = 'Invalid Query: ' . mysql_error() . "\n"; - $message .= 'Full query: ' . 
$query; - die($message); - } - - $bookmark_array = array(); - - while ($row = mysql_fetch_assoc($result)) { - library\Toolkit::addOrInitiatlizeArrayKeyNumerical($bookmark_array, $row['contentID']); - } - - $return_array = array(); - - foreach ($bookmark_array as $id => $num) { - $return_array[] = array("id" => $id, "num" => $num); - } - - return $return_array; - } - -} diff --git a/server/classes/headstart/preprocessing/Readme.md b/server/classes/headstart/preprocessing/Readme.md deleted file mode 100644 index 2ad43d2c3..000000000 --- a/server/classes/headstart/preprocessing/Readme.md +++ /dev/null @@ -1,50 +0,0 @@ -Pre-processing scripts -====================== - -The pre-processing scripts are used to generate a data file for the visualization. This data file contains all the necessary information for Head Start. This includes - -* Metadata for a paper -* Position of a paper -* Information about which cluster a paper belongs to -* Cluster names - -Authors -------- -Peter Kraker (peter.kraker@tugraz.at) - - -Background ----------- - -You can find more on the background of the pre-processing steps in my dissertation (chapter 5) which can be found [here](http://media.obvsg.at/p-AC11312305-2001). - - -Requirements ------------- - -To get started on the pre-processing, you need two things: - -1. A file containing all the metadata of your documents. You can find a template for this file in output/metadata.csv. -2. A file containing similarity values between the documents. A template can be found in cooc.csv. Originally, the similarity values were based on readership co-occurrence, but there are many other measures that you can use (e.g. the number of keywords or tags that two papers have in common). - -If you want to write an adapter for your favorite data source, you can create a derived class from the base class at headstart\preprocessing\connection\Connection. - -Procedure ---------- - -There are at least two steps involved in the pre-processing pipe (see main.php for an example pipe): - -* Caluclation: this step takes care of multidimensional scaling (i.e. the ordination of the papers) and clustering (i.e. splitting the papers into research areas). See headstart\preprocessing\calculation\RCalculation for an example. - -* Naming: introduces names for each area. See headstart\preprocessing\naming\ApiNaming. - -Before you get started, please create a copy of the file /server/preprocessing/conf/config.ini and rename it to config\_local.ini. Now you need to set a few variables: - -* general/preprocessing_dir: Full path of the preprocessing directory. -* calculation/binary: Full path to your R binary. -* naming/api\_key\_zemanta and naming/api\_key\_calais: The naming the clusters is done with the help of the APIs of Zemanta and OpenCalais. Before you can get started you need to acquire (free) API keys for both of them and enter them in the config file. - -Then you can run the pipeline found in main.php. 
- - - diff --git a/server/classes/headstart/preprocessing/calculation/RCalculation.php b/server/classes/headstart/preprocessing/calculation/RCalculation.php index 7c75f0c94..7449e7406 100644 --- a/server/classes/headstart/preprocessing/calculation/RCalculation.php +++ b/server/classes/headstart/preprocessing/calculation/RCalculation.php @@ -44,9 +44,17 @@ public function performCalculationAndReturnOutputAsJSON($working_dir, $query, $p } //library\Toolkit::info($path); - exec($path, $output_r); - - return $output_r; + // Rudimentary rate limiting: only launch the R script if at most eight + // processes matching "R" are already running for www-data; otherwise + // return an error result so the caller can retry later. + if(exec("ps -u www-data | grep R | wc -l") <= 8) { + exec($path, $output_r); + return $output_r; + } else { + $output = array(json_encode(array("status" => "error", + "reason" => "dataprocessing rate limit"))); + return $output; + } } public function performStreamgraphCalculation($working_dir, $service, $output_json) { diff --git a/server/classes/headstart/preprocessing/connection/Connection.php b/server/classes/headstart/preprocessing/connection/Connection.php deleted file mode 100644 index 45f9362a9..000000000 --- a/server/classes/headstart/preprocessing/connection/Connection.php +++ /dev/null @@ -1,26 +0,0 @@ -ini_array = $ini_array; - } - - public function establishConnection() { - - $ini = $this->ini_array["connection"]; - - $this->db = mysql_connect( $ini["host"] . ":" . $ini["port"], - $ini["username"], $ini["password"] ); - - if(!$this->db) - throw new \Exception("Failed to connect to MySQL: " . mysql_error()); - - mysql_set_charset('utf8',$this->db); - - mysql_select_db($ini["db"]); - } - - public function writeCoocFile($conference_id, $cut_off, $file_path) { - - $libraries = $this->getBookmarks($conference_id, $cut_off); - - $count = 0; - - $cooc = array(); - - foreach($libraries as $line) { - fwrite(STDOUT, "Processing line #".$count."\n"); - foreach ($line as $article) { - - if (!array_key_exists($article, $this->numBookmarks)) - continue; - - if($article != "") { - for($row=0; $row < count($line); $row++) { - $coarticle = $line[$row]; - - if (!array_key_exists($coarticle, $this->numBookmarks)) - continue; - - if($coarticle != "") { - if(!isset($cooc[$article.",".$coarticle])) { - $cooc[$article.",".$coarticle] = 1; - } else { - $cooc[$article.",".$coarticle] += 1; - fwrite(STDOUT, "Added to ".$article.",".$coarticle.": ".$cooc[$article.",".$coarticle]."\n"); - } - } - } - } - } - $count++; - } - - arsort($cooc, SORT_NUMERIC); - - - $file_out = library\Toolkit::openOrCreateFile($file_path); - foreach($cooc as $entry=>$count) { - if(intval($count) >= 1) { - fwrite($file_out, $entry.",".$count."\n"); - } - } - fclose($file_out); - } - - public function writeMetadataFile($conference_id, $file, $cut_off) { - - $query_authors = sprintf("SELECT DISTINCT content.contentID, author.name - FROM content, authorpresenter, author - WHERE (content.contentID=authorpresenter.contentID) AND (authorpresenter.authorID=author.authorID) - AND (content.contentType <> %s)" ,"\"no-paper\""); - - $query_authors .= $this->createConferenceIDString($conference_id); - - $result_authors = mysql_query($query_authors); - - $paper_authors = array(); - - while ($row = mysql_fetch_assoc($result_authors)) { - library\Toolkit::addOrInitiatlizeArrayKey($paper_authors, $row['contentID'], $row['name']); - } - - $query = sprintf("SELECT DISTINCT content.contentID, content.title, content.abstract, content.contentType, content.contentTrack, content.contentLink - FROM 
content - WHERE (content.contentType <> %s)" ,"\"no-paper\""); - - $query .= $this->createConferenceIDString($conference_id); - - $result = mysql_query($query); - - $file = fopen($file, "w+"); - - fputcsv($file, array('id', 'title', 'paper_abstract', 'published_in', 'year', 'url', 'readers', 'authors')); - - while ($row = mysql_fetch_assoc($result)) { - - if (!array_key_exists($row['contentID'], $this->numBookmarks)) { - $row[] = 0; - } else { - $row[] = $this->numBookmarks[$row['contentID']]; - } - - $author_string = ""; - - if(isset($paper_authors[$row['contentID']])) { - foreach($paper_authors[$row['contentID']] as $author) { - $author_string .= $author . ";"; - } - } - - $row[] = $author_string; - - fputcsv($file, $row); - } - - } - - protected function getBookmarks($conference_id, $cut_off) { - - $query = sprintf("SELECT bookmarking.bookmarkingID, bookmarking.userID, bookmarking.contentID - FROM bookmarking, presentation, eventsession, content - WHERE (bookmarking.contentID=presentation.contentID) AND (bookmarking.contentID = content.contentID) - AND (presentation.eventSessionID=eventsession.eventSessionID) AND (content.contentType <> %s)" - ,"\"no-paper\""); - - $query .= $this->createConferenceIDString($conference_id); - - $result = mysql_query($query); - - if (!$result) { - $message = 'Invalid Query: ' . mysql_error() . "\n"; - $message .= 'Full query: ' . $query; - die($message); - } - - $bookmark_array = array(); - - while ($row = mysql_fetch_assoc($result)) { - $bookmark_array[] = $row; - library\Toolkit::addOrInitiatlizeArrayKeyNumerical($this->numBookmarks, $row['contentID']); - } - - if(count($this->numBookmarks) > 0) { - - $this->numBookmarks = array_filter($this->numBookmarks, - function($item) use($cut_off) { - return $item >= $cut_off; - }); - } - - $library_array = array(); - - foreach($bookmark_array as $line) { - if(!isset($library_array[$line['userID']])) - $library_array[$line['userID']] = array($line['contentID']); - else - $library_array[$line['userID']][] = $line['contentID']; - } - - return $library_array; - - } - - protected function createConferenceIDString($conference_id) { - - $query = " AND ("; - - if(is_array($conference_id)) { - foreach($conference_id as $id) { - $query .= sprintf("(content.conferenceID=%d) OR ", mysql_real_escape_string($id)); - } - - $query = substr($query, 0, strlen($query)-4); - $query .= ")"; - - } else { - $query .= sprintf("content.conferenceID=%d)", mysql_real_escape_string($conference_id)); - } - - return $query; - - } - - protected function createApiString($api) { - - $query = " AND ("; - - if(is_array($api)) { - foreach($api as $value) { - $query .= sprintf("(concepts.conceptAPI=\"%s\") OR ", mysql_real_escape_string($value)); - } - - $query = substr($query, 0, strlen($query)-4); - $query .= ")"; - - } else { - $query .= sprintf("concepts.conceptAPI=\"%s\")", mysql_real_escape_string($api)); - } - - return $query; - - } -} diff --git a/server/classes/headstart/preprocessing/connection/DBConnectionTopics.php b/server/classes/headstart/preprocessing/connection/DBConnectionTopics.php deleted file mode 100644 index dbad697c9..000000000 --- a/server/classes/headstart/preprocessing/connection/DBConnectionTopics.php +++ /dev/null @@ -1,114 +0,0 @@ - $values) { - $rank = 1; - - foreach($values["topics"] as $value) { - - $query = sprintf("INSERT INTO concepts (contentID,conceptName,conceptAPI,conceptRank) VALUES(%d, \"%s\", \"%s\", %d)", - mysql_real_escape_string($id), mysql_real_escape_string($value), mysql_real_escape_string($api), - 
mysql_real_escape_string($rank)); - - $result = mysql_query($query, $this->db); - - if(!$result) { - echo mysql_error(); - } - - $rank++; - } - } - } - - public function writeKeywordsToDB($conference_id) { - - $query = "SELECT content.contentID, content.keywords - FROM content - WHERE content.contentID=content.contentID"; - - $query .= $this->createConferenceIDString($conference_id); - - $result = mysql_query($query); - - $keyword_array = array(); - - while ($row = mysql_fetch_assoc($result)) { - - $keyword_array[$row['contentID']]["topics"] = explode(", ", $row['keywords']); - - } - - $this->writeTopicsToDB($keyword_array, "keywords"); - - } - - public function writeCoocFile($conference_id, $cut_off, $file_path, $api, $normalizeStrings=false) { - $query = "SELECT DISTINCT concepts.contentID, concepts.conceptName, concepts.conceptAPI - FROM concepts, content - WHERE concepts.contentID = content.contentID"; - - $query .= $this->createConferenceIDString($conference_id); - - if($api != null) { - $query .= $this->createApiString($api); - } - - $result = mysql_query($query); - - if($result == false) { - throw new \Exception("Query failed! " . mysql_error()); - } - - $topic_array = array(); - - while ($row = mysql_fetch_assoc($result)) { - - $topic_array[$row['contentID']][] = $row['conceptName']; - - } - - $file = library\Toolkit::openOrCreateFile($file_path); - - foreach($topic_array as $id1 => $terms1) { - - if($normalizeStrings) { - $terms1 = array_map('headstart\library\Toolkit::normalizeString', $terms1); - } - - foreach($topic_array as $id2 => $terms2) { - if($id1 == $id2) - continue; - - if($normalizeStrings) { - $terms2 = array_map('headstart\library\Toolkit::normalizeString', $terms2); - } - - $terms1_unique = array_unique($terms1); - $terms2_unique = array_unique($terms2); - - $out = array($id1, $id2, count(array_intersect($terms1_unique, $terms2_unique))); - fputcsv($file, $out); - } - } - - fclose($file); - } - - protected function formatStrings($string) { - - } -} diff --git a/server/classes/headstart/preprocessing/connection/DBConnectionWordnet.php b/server/classes/headstart/preprocessing/connection/DBConnectionWordnet.php deleted file mode 100644 index b8b825e05..000000000 --- a/server/classes/headstart/preprocessing/connection/DBConnectionWordnet.php +++ /dev/null @@ -1,75 +0,0 @@ -queryDB($conference_id); - - $this->writeToFile($file_path, $result); - - } - - public function returnContents($conference_id, $limit_from = null, $limit_to = null) { - - $result = $this->queryDB($conference_id, $limit_from, $limit_to); - - $contents = array(); - - while ($row = mysql_fetch_assoc($result)) { - $contents[$row['contentID']] = $row['title'] . " " . $row['abstract']; - } - - return $contents; - } - - - protected function queryDB($conference_id, $limit_from = null, $limit_to = null) { - - $query = sprintf("SELECT DISTINCT content.contentID, content.title, content.abstract - FROM content - WHERE (content.contentType <> %s)" - ,"\"no-paper\""); - - $query .= $this->createConferenceIDString($conference_id); - - if(!is_null($limit_from) && !is_null($limit_to)) { - $query .= " LIMIT " . $limit_from . "," . $limit_to; - } - - $result = mysql_query($query, $this->db); - - return $result; - } - - protected function writeToFile($file_path, $result) { - - $file = library\Toolkit::openOrCreateFile($file_path); - - fputcsv($file, array('id', 'content')); - - while ($row = mysql_fetch_assoc($result)) { - - $merged_string = $row['title'] . " " . 
$row['abstract']; - - $merged_array = array($row['contentID'], $merged_string); - - fputcsv($file, $merged_array); - } - fclose($file); - - } - -} diff --git a/server/classes/headstart/preprocessing/main.php b/server/classes/headstart/preprocessing/main.php deleted file mode 100644 index 7e4467eea..000000000 --- a/server/classes/headstart/preprocessing/main.php +++ /dev/null @@ -1,22 +0,0 @@ -performCalculationAndWriteOutputToFile($WORKING_DIR); - -$naming = new naming\ApiNaming($ini_array); -$naming->performNaming($WORKING_DIR); diff --git a/server/classes/headstart/preprocessing/main_topics.php b/server/classes/headstart/preprocessing/main_topics.php deleted file mode 100644 index fa68a471d..000000000 --- a/server/classes/headstart/preprocessing/main_topics.php +++ /dev/null @@ -1,64 +0,0 @@ -establishConnection(); - -//$dbconnect->writeMetadataFile($ini_array["general"]["event_id"], -// $WORKING_DIR . $ini_array["output"]["metadata"], -// $ini_array["general"]["cut_off"]); -// -//$contents = $dbconnect->returnContents($ini_array["general"]["event_id"]); -// -$naming = new naming\ApiNaming($ini_array); -//$topics = $naming->executeCurlSensium($contents); -// -//foreach($topics as $api => $topic) { -// $cluster_names = $topics[$api]; -// $dbconnect->writeTopicsToDB($cluster_names, $api); -//} -// -//$topics = $naming->executeCurl($contents); -// -//foreach($topics as $api => $topic) { -// $cluster_names = $topics[$api]; -// $dbconnect->writeTopicsToDB($cluster_names, $api); -//} - -$dbconnect->writeCoocFile($ini_array["general"]["event_id"], - $ini_array["general"]["cut_off"], - $WORKING_DIR . $ini_array["output"]["cooc"], - array("calais", "zemanta", "sensium"), - false); - -$calculation = new calculation\RCalculation($ini_array); -$calculation->performCalculationAndWriteOutputToFile($WORKING_DIR); - -$naming->performNaming($WORKING_DIR); - diff --git a/server/classes/headstart/preprocessing/naming/ApiNaming.php b/server/classes/headstart/preprocessing/naming/ApiNaming.php deleted file mode 100644 index bf474e406..000000000 --- a/server/classes/headstart/preprocessing/naming/ApiNaming.php +++ /dev/null @@ -1,616 +0,0 @@ -ini_array["naming"]; - $ini_general = $this->ini_array["general"]; - $ini_output = $this->ini_array["output"]; - $ini_connection = $this->ini_array["connection"]; - - $WORKING_DIR = $working_dir; - - //Output of scaling and clustering script - $CLUSTERS = $WORKING_DIR . $ini_output["output_scaling_clustering"]; - - //Output file - $OUTPUT_FILE = $WORKING_DIR . $ini_output["output_naming"]; - - //Output file for the full API responses - $FULL_ZEMANTA = $WORKING_DIR . "full_responses/zemanta/"; - $FULL_CALAIS = $WORKING_DIR . "full_responses/calais/"; - - $cluster = array(); - $cluster_details = array("title" => array(), "abstracts" => array()); - $counts = array(); - $stop_words = array(); - $output = array(); - - $cluster_text_file = library\Toolkit::openFileForReading($CLUSTERS); - $stop_words_file = library\Toolkit::openFileForReading($ini_general["preprocessing_dir"] . 
$ini["stop_words"]); - - while (($line = fgetcsv($stop_words_file, null, "\t")) !== false) { - $this->stop_words[] = $line[0]; - } - - $row = 0; - - while (($line = fgetcsv($cluster_text_file, null)) !== false) { - if ($row == 0) { - $output[] = $line; - $row++; - continue; - } - - $output[] = $line; - - $line_cluster_id = intval($ini["line_cluster_id"]); - $line_title = intval($ini["line_title"]); - $line_abstract = intval($ini["line_abstract"]); - - if(!isset($line[$line_cluster_id])) { - throw new \Exception("Error in line: " . $line[0]); - } - - if (!isset($cluster[$line[$line_cluster_id]])) { - $cluster[$line[$line_cluster_id]] = $line[$line_title] . ". " . $line[$line_abstract]; - $cluster_details[$line[$line_cluster_id]]["title"] = $line[$line_title] . "."; - $cluster_details[$line[$line_cluster_id]]["abstracts"] = $line[$line_abstract]; - $counts[$line[$line_cluster_id]] = 1; - } else { - $cluster[$line[$line_cluster_id]] .= "\n" . $line[$line_title] . ". " . $line[$line_abstract]; - $cluster_details[$line[$line_cluster_id]]["title"] .= "\n" . $line[$line_title] . "."; - $cluster_details[$line[$line_cluster_id]]["abstracts"] .= "\n" . $line[$line_abstract]; - $counts[$line[$line_cluster_id]]++; - } - } - - $topics = $this->executeCurl($cluster); - - $cluster_names = array(); - - foreach ($cluster as $id => $text) { - - library\Toolkit::info($text); - - $categories = array("topics" => array(), "topics_title" => array()); - $categories_one = array("topics" => array()); - - //get 1-grams - $response_object_one = $this->getNgrams($cluster[$id], 1); - - $this->processNgrams($response_object_one, "topics_title", $categories_one, $ini["threshold_single_words"]); - - //get 2-, 3-, and 4-grams - for ($n = 4; $n >= 2; $n--) { - - $response_object = $this->getNgrams($cluster[$id], $n); - $response_object_title = $this->getNgrams($cluster_details[$id]["title"], $n); - - arsort($response_object); - - $this->processNgrams($response_object_title, "topics_title", $categories, $ini["threshold_title_ngrams"]); - - $this->processNgrams($response_object, "topics", $categories, $ini["threshold_title_abstract_ngrams"]); - - } - - library\Toolkit::info($id . ": " . print_r($categories, true)); - - $cluster_names_calais = $topics["calais"][$id]; - $cluster_names_zemanta = $topics["zemanta"][$id]; - - $cluster_name = ""; - - //Search for 4-, 3-, and 2-title-grams in Calais concepts - $cluster_name = $this->compareConcepts($cluster_names_calais, $categories, "topics_title"); - - //If that fails, search for 4-, 3-, and 2-title-grams in Zemanta concepts - if($cluster_name == "") { - $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics_title"); - } - - //If that fails, search for 4-, 3-, and 2-grams in Zemanta concepts - if($cluster_name == "") { - $cluster_name = $this->compareConcepts($cluster_names_zemanta, $categories, "topics"); - } - - //If that fails, search for 1-grams in Zemanta concepts - if ($cluster_name == "") { - - $count_new = 0; - - $filtered_array = array_filter($categories_one["topics_title"], function ($item) { - return !in_array($item, $this->ini_array["naming"]["forbidden_names"]); - }); - - library\Toolkit::info("Filtered Array: " . 
print_r($filtered_array, true)); - - - foreach ($cluster_names_zemanta["topics_format"] as $name) { - $key = array_search($name, $filtered_array); - if ($key !== false) { - $cluster_name = $cluster_names_zemanta["topics"][$count_new]; - break; - } - $count_new++; - } - } - - //If everything above fails, name the cluster after the most important concept - //returned by (1) Zemanta or (2) Calais. Finally, name the cluster - //"Miscellaneous" - if ($cluster_name == "") { - if(isset($cluster_names_zemanta["topics"][0])) { - $cluster_name = $cluster_names_zemanta["topics"][0]; - } elseif (isset($cluster_names_calais["topics"][0])) { - $cluster_name = $cluster_names_calais["topics"][0]; - } else { - $cluster_name = "Miscellaneous"; - } - } - - $cluster_id = library\Toolkit::generateUriFromString($cluster_name); - - $cluster_temp = $cluster_id; - $count = 1; - - foreach($cluster_names as $attributes) { - if($attributes["uri"] == $cluster_id) { - $cluster_id = $cluster_temp . "-" . $count; - $count++; - } - } - - $cluster_names[$id] = array("name" => $cluster_name, "uri" => $cluster_id); - - library\Toolkit::info("*** CLUSTER NAME: " . $cluster_name . "\n"); - - //Write full response for later consultation - - $this->getFullResponseZemanta($text, $cluster_id, $FULL_ZEMANTA); - - $this->getFullResponseCalais($text, $cluster_id, $FULL_CALAIS); - } - - //add areas to output array - array_push($output[0], "area_uri", "area"); - - library\Toolkit::info(sizeof($output) . "\n"); - $size = sizeof($output); - - for($counter = 1; $counter < $size; $counter++) { - - $cluster_id = $output[$counter][$line_cluster_id]; - array_push($output[$counter], $cluster_names[$cluster_id]["uri"], $cluster_names[$cluster_id]["name"]); - - library\Toolkit::info("$counter\n"); - } - - $output_handle = library\Toolkit::openOrCreateFile($OUTPUT_FILE); - - foreach ($output as $line) { - fputcsv($output_handle, $line); - } - - fclose($output_handle); - - $UNIQUE_ID = $ini_output["unique_id"]; - $TITLE = $ini_output["title"]; - $persistence = new persistence\SQLitePersistence($ini_connection["sqlite_db"]); - - $header = array_shift($output); - $json_array = array(); - foreach ($output as $row) { - $json_array[] = array_combine($header, $row); - } - - $json = json_encode($json_array); - - $persistence->createVisualization($UNIQUE_ID, $TITLE, $json); - } - - public function executeCurl($clusters) { - - //Initialize cURL multi - $mh_calais_array = array(); - $counter = 0; - $mh_calais_array_counter = 0; - - $mh_zemanta = curl_multi_init(); - $curl_calais_array = array(); - $curl_zemanta_array = array(); - - foreach ($clusters as $id => $text) { - - //Open Calais only allows only for 4 requests at a given time - if($counter % 4 == 0) { - $mh_calais_array_counter++; - $mh_calais_array[$mh_calais_array_counter] = curl_multi_init(); - } - $counter++; - - $curl_calais_array[$id] = $this->createNewCurlHandleCalais($text, "application/json"); - $curl_zemanta_array[$id] = $this->createNewCurlHandleZemanta($text, "json"); - - curl_multi_add_handle($mh_calais_array[$mh_calais_array_counter], $curl_calais_array[$id]); - curl_multi_add_handle($mh_zemanta, $curl_zemanta_array[$id]); - - } - - $active1 = null; - $active2 = null; - - // Run cURL handles - foreach($mh_calais_array as $mh_calais) { - do { - usleep(100000); - $status = curl_multi_exec($mh_calais, $active1); - - } while ($status === CURLM_CALL_MULTI_PERFORM || $active1 > 0); - - $active1 = null; - } - - do { - - usleep(10000); - $status = curl_multi_exec($mh_zemanta, $active2); - 
$info = curl_multi_info_read($mh_zemanta); - - } while ($status === CURLM_CALL_MULTI_PERFORM || $active2 > 0); - - $topics = array("calais" => array(), "zemanta" => array()); - - foreach($clusters as $id => $cluster) { - - $result_calais = curl_multi_getcontent($curl_calais_array[$id]); - $topics["calais"][$id] = $this->getClusterNamesCalais($result_calais); - curl_multi_remove_handle($mh_calais, $curl_calais_array[$id]); - - $result_zemanta = curl_multi_getcontent($curl_zemanta_array[$id]); - $topics["zemanta"][$id] = $this->getClusterNamesZemanta($result_zemanta); - curl_multi_remove_handle($mh_zemanta, $curl_zemanta_array[$id]); - } - - curl_multi_close($mh_calais); - curl_multi_close($mh_zemanta); - - return $topics; - - } - - public function executeCurlSensium($clusters) { - //Initialize cURL multi - $mh_sensium_array = array(); - $counter = 0; - $mh_sensium_array_counter = 0; - - foreach ($clusters as $id => $text) { - - //Open Calais only allows only for 4 requests at a given time - if($counter % 2 == 0) { - $mh_sensium_array_counter++; - $mh_sensium_array[$mh_sensium_array_counter] = curl_multi_init(); - } - $counter++; - - $curl_sensium_array[$id] = $this->createNewCurlHandleSensium($text); - - curl_multi_add_handle($mh_sensium_array[$mh_sensium_array_counter], $curl_sensium_array[$id]); - - } - - $active1 = null; - - // Run cURL handles - foreach($mh_sensium_array as $mh_sensium) { - do { - usleep(100000); - $status = curl_multi_exec($mh_sensium, $active1); - - } while ($status === CURLM_CALL_MULTI_PERFORM || $active1 > 0); - - $active1 = null; - } - - $topics = array("sensium" => array()); - - foreach($clusters as $id => $cluster) { - - $result_sensium = curl_multi_getcontent($curl_sensium_array[$id]); - $topics["sensium"][$id] = $this->getClusterNamesSensium($result_sensium); - curl_multi_remove_handle($mh_sensium, $curl_sensium_array[$id]); - - } - - curl_multi_close($mh_sensium); - - return $topics; - } - - private function compareConcepts($cluster_names, $categories, $categories_part) { - - $cluster_name = ""; - $count = 0; - - foreach ($cluster_names["topics_format"] as $name) { - - $key = array_search($name, $categories[$categories_part]); - if (!$key === false) { - $key = array_search($name, $categories["topics"]); - } - - if ($key !== false) { - $cluster_name = $cluster_names["topics"][$count]; - break; - } - $count++; - } - - return $cluster_name; - - } - - private function processNgrams($response_object, $category_title, &$categories_object, $threshold) { - - $categories_object[$category_title] = array(); - - foreach ($response_object as $name => $count) { - $name_array = preg_split('/ /', $name); - - //first and last word should not be a stop word - if (!in_array($name_array[0], $this->stop_words) - && !in_array($name_array[count($name_array) - 1], $this->stop_words) - && $count >= $threshold) { - $categories_object[$category_title][] = $name; - } - } - } - - private function getNgrams($text, $n = 3) { - - $ngrams = array(); - $new_string = preg_replace('/[^a-zA-Z0-9\s]/', '', $text); - $new_string = strtolower($new_string); - $tokens = preg_split('/\s/', $new_string); - $len = count($tokens); - for ($i = 0; $i < $len - $n; $i++) { - $ng = ''; - for ($j = $i; $j < $i + $n; $j++) { - $ng .= \Inflector::singularize(trim($tokens[$j])) . 
" "; - } - - $ng = trim($ng); - - if (isset($ngrams[$ng])) - $ngrams[$ng]++; - else - $ngrams[$ng] = 1; - } - return $ngrams; - } - - private function createNewCurlHandleCalais($text, $format) { - - $apiKey = $this->ini_array["naming"]["api_key_calais"]; - - $contentType = "text/xml"; - $outputFormat = $format; - - $metaDataType = "GenericRelations,SocialTags"; - - $restURL = "http://api.opencalais.com/enlighten/rest/"; - $paramsXML = " " . - " " . - " " . - "Educational Technology" . - " - "; - - - $data = "licenseID=" . urlencode($apiKey); - $data .= "¶msXML=" . urlencode($paramsXML); - $data .= "&content=" . urlencode($text); - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $restURL); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); - curl_setopt($ch, CURLOPT_HEADER, 0); - curl_setopt($ch, CURLOPT_POSTFIELDS, $data); - curl_setopt($ch, CURLOPT_POST, 1); - curl_setopt($ch, CURLOPT_TIMEOUT, 60); - - return $ch; - - } - - private function createNewCurlHandleZemanta($text, $format) { - - $url = 'http://api.zemanta.com/services/rest/0.0/'; - $key = $this->ini_array["naming"]["api_key_zemanta"]; - $method = "zemanta.suggest"; - $categories = "dmoz"; - - /* It is easier to deal with arrays */ - $args = array( - 'method' => $method, - 'api_key' => $key, - 'text' => $text, - 'format' => $format, - 'return_rdf_links' => 1 - , 'return_categories' => 1 - , 'return_keywords' => 1 - , 'return_images' => 0 - , 'return_categories' => $categories - ); - - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_POST, 1); - curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($args, '', '&')); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - - return $ch; - - } - - private function createNewCurlHandleSensium($text) { - - $url = 'https://api.sensium.io/v1/extract'; - $key = $this->ini_array["naming"]["api_key_sensium"]; - - $args = array( - 'apiKey' => $key, - 'text' => $text, - "extractors" => array("Summary") - ); - - $header_args = array( - 'Content-Type: application/json' - , 'Accept: application/json' - //, 'Accept-encoding: \'gzip\'' - ); - - $json_args = json_encode($args); - - $ch = curl_init(); - curl_setopt($ch, CURLOPT_URL, $url); - curl_setopt($ch, CURLOPT_HTTPHEADER, $header_args); - curl_setopt($ch, CURLOPT_POST, 1); - //curl_setopt($ch, CURLOPT_PROXY, '127.0.0.1:8888'); - //curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($args, '', '&')); - curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); - curl_setopt($ch, CURLOPT_POSTFIELDS, $json_args); - curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); - - return $ch; - - } - - private function getClusterNamesCalais($response) { - - $response_object = json_decode($response); - - $categories = array("topics" => array(), "topics_format" => array()); - - foreach ($response_object as $entity => $attributes) { - if ($entity == "doc") { - continue; - } - - switch ($attributes->_typeGroup) { - case "socialTag" : { - $categories["topics"][] = $attributes->name; - - $final_string = library\Toolkit::normalizeString($attributes->name); - - $categories["topics_format"][] = $final_string; - break; - } - - default : - continue; - } - } - - library\Toolkit::info("Calais: " . 
print_r($categories["topics_format"], true)); - - return $categories; - } - - private function getClusterNamesZemanta($response) { - - $response_object = json_decode($response); - - $categories = array("topics" => array(), "topics_format" => array()); - - $links = $response_object->markup->links; - $count = 0; - - foreach ($links as $link) { - foreach ($link->target as $site) { - if (stristr($site->url, "http://dbpedia.org")) { - if ($link->entity_type != NULL) { - foreach ($link->entity_type as $entity_type) { - if (isset($categories[$entity_type])) { - $categories[$entity_type][] = $site->title; - } else { - $categories[$entity_type] = array($site->title); - } - } - } else { - $categories["topics"][] = $site->title; - $categories["topics_format"][] = library\Toolkit::normalizeString($site->title); - $count++; - } - } - } - } - - library\Toolkit::info("Zemanta: " . print_r($categories["topics_format"], true)); - - return $categories; - } - - private function getClusterNamesSensium($response) { - - $response_object = json_decode($response); - - $categories = array("topics" => array(), "topics_format" => array()); - - foreach ($response_object->summary->keyPhrases as $phrase) { - - $categories["topics"][] = $phrase->text; - - $final_string = library\Toolkit::normalizeString($phrase->text); - - $categories["topics_format"][] = $final_string; - } - - library\Toolkit::info("Sensium: " . print_r($categories["topics_format"], true)); - - return $categories; - } - - private function getFullResponseCalais($text, $uri, $dir) { - - $ch = $this->createNewCurlHandleCalais($text, "XML/RDF"); - $response = curl_exec($ch); - curl_close($ch); - - library\Toolkit::putContentsToFile($dir . $uri . ".rdf", $response); - } - - private function getFullResponseZemanta($text, $uri, $dir) { - - $ch = $this->createNewCurlHandleZemanta($text, "rdfxml"); - $response = curl_exec($ch); - curl_close($ch); - - library\Toolkit::putContentsToFile($dir . $uri . 
".rdf", $response); - } -} diff --git a/server/classes/headstart/preprocessing/naming/KeywordNaming.php b/server/classes/headstart/preprocessing/naming/KeywordNaming.php deleted file mode 100644 index 3f91df2c1..000000000 --- a/server/classes/headstart/preprocessing/naming/KeywordNaming.php +++ /dev/null @@ -1,129 +0,0 @@ - $current_array) { - $counted_sorted_array = array_count_values($current_array); - arsort($counted_sorted_array); - $important_terms = array_keys(array_slice($counted_sorted_array, 0, $num_keywords)); - $final_string = implode(", ", $important_terms); - $result_array[$key] = $final_string; - } - - foreach ($array as $key => $entry) { - $array[$key]["area"] = $result_array[$entry[$id]]; - } - } - - public function performNamingTfIdf(&$array, $num_keywords, $keyword_separator, $taxonomy_separator, $id = "area_uri", $subjects = "subject") { - - $working_array = array(); - - foreach ($array as $entry) { - $uri = $entry[$id]; - $keywords = explode($keyword_separator, $entry[$subjects]); - foreach ($keywords as &$keyword) { - $keyword = preg_replace("/\/", "&#x$1;", $keyword); - if ($taxonomy_separator != null) { - $keyword = substr($keyword, strrpos($keyword, $taxonomy_separator) + 1); - } - } - - //$working_array[$uri] = array(); - - if (isset($working_array[$uri]["all_terms"])) { - $working_array[$uri]["all_terms"] = array_merge($working_array[$uri]["all_terms"], $keywords); - } else { - $working_array[$uri]["all_terms"] = $keywords; - } - } - - $num_docs_per_term = array(); - - foreach ($working_array as $uri => $current_array) { - $current_array["all_terms"] = array_filter($current_array["all_terms"]); - $current_array["all_terms"] = array_map('trim', $current_array["all_terms"]); - array_walk($current_array["all_terms"], function(&$value, &$key) { - $value = ucfirst($value); - }); - - $unique_terms = array_unique($current_array["all_terms"]); - $working_array[$uri]["unique_terms"] = $unique_terms; - - foreach ($unique_terms as $term) { - if (!isset($num_docs_per_term[$term])) - $num_docs_per_term[$term] = 1; - else - $num_docs_per_term[$term] += 1; - } - } - - $result_array = array(); - $totalDocs = count($working_array); - - foreach ($working_array as $uri => $current_array) { - - $current_array["all_terms"] = array_replace($current_array["all_terms"], array_fill_keys(array_keys($current_array["all_terms"], null), '')); - - $num_keywords_per = array_count_values($current_array["all_terms"]); - $wordCount = count($current_array["all_terms"]); - $current_result_array = array(); - - foreach ($current_array["unique_terms"] as $term) { - $termCount = isset($num_keywords_per[$term]) ? ($num_keywords_per[$term]) : (0); - $docsWithTerm = $num_docs_per_term[$term]; - - $tf = $termCount / $wordCount; - $idf = log($totalDocs / $docsWithTerm, 2); - $tfidf = $tf * $idf; - - //$tfidf_short = round($tfidf,2); - //$current_result_array[$term. " " . 
$tfidf_short] = $tfidf; - - $current_result_array[$term] = $tfidf; - } - - arsort($current_result_array); - - $important_terms = array_keys(array_slice($current_result_array, 0, $num_keywords)); - - $final_string = implode(", ", $important_terms); - $result_array[$uri] = $final_string; - } - - foreach ($array as $uri => $entry) { - $array[$uri]["area"] = $result_array[$entry[$id]]; - } - } - -} diff --git a/server/classes/headstart/preprocessing/naming/Naming.php b/server/classes/headstart/preprocessing/naming/Naming.php deleted file mode 100644 index 7c73c0437..000000000 --- a/server/classes/headstart/preprocessing/naming/Naming.php +++ /dev/null @@ -1,24 +0,0 @@ -ini_array = $ini_array; - - } - - public function performNaming(&$array, $num_keywords) { - - } - -} diff --git a/server/classes/headstart/preprocessing/test_rplos.php b/server/classes/headstart/preprocessing/test_rplos.php deleted file mode 100644 index 3ca817d9c..000000000 --- a/server/classes/headstart/preprocessing/test_rplos.php +++ /dev/null @@ -1,32 +0,0 @@ -performCalculationAndReturnOutputAsJSON($WORKING_DIR, "dna"); - -$output_json = end($output); - -//echo $output_json; - -$persistence = new persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); -//$persistence->createVisualization("test_rplos", "My Test RPLOS", $output_json); - -$persistence->writeRevision("test_rplos", $output_json); - -//$naming = new naming\ApiNaming($ini_array); -//$naming->performNaming($WORKING_DIR); - diff --git a/server/preprocessing/conf/config.ini b/server/preprocessing/conf/config.ini index 5ad9b1364..efea09612 100644 --- a/server/preprocessing/conf/config.ini +++ b/server/preprocessing/conf/config.ini @@ -14,6 +14,8 @@ vis_path = "path/to/vis" services_path = "server/services/" # URL to OKMaps API api_url = "http://127.0.0.1/api/" +# flavor of API, default: "stable" +api_flavor = "stable" # The persistence backend to use - either api or legacy persistence_backend = "legacy" # The processing backend to use - either api or legacy @@ -64,25 +66,3 @@ binary = "/usr/bin/Rscript" # Relative path from preprocessing_dir to the R script script = "other-scripts/text_similarity.R" mode = "bookmarks" - -[naming] -api_key_zemanta = "" -api_key_calais = "" - -# Constants for column numbers in the scaling and clustering output -line_cluster_id = 10 -line_title = 1 -line_abstract = 2 - -# English stop word file -stop_words = "resources/english.stop"; - -# Thresholds for n-grams -threshold_title_ngrams = 2; -threshold_title_abstract_ngrams = 3; -threshold_single_words = 4; - -forbidden_names[] = "research" -forbidden_names[] = "science" -forbidden_names[] = "inquiry" -forbidden_names[] = "learning" diff --git a/server/services/GSheetUpdateAvailable.php b/server/services/GSheetUpdateAvailable.php index dd0722c1b..bae8758f9 100644 --- a/server/services/GSheetUpdateAvailable.php +++ b/server/services/GSheetUpdateAvailable.php @@ -5,23 +5,21 @@ require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; require dirname(__FILE__) . '/../classes/headstart/persistence/SQLitePersistence.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; use headstart\library; $INI_DIR = dirname(__FILE__) . 
"/../preprocessing/conf/"; - $ini_array = library\Toolkit::loadIni($INI_DIR); +$apiclient = new \headstart\library\APIClient($ini_array); +$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $vis_id = library\CommUtils::getParameter($_GET, "vis_id"); $gsheet_last_updated = library\CommUtils::getParameter($_GET, "gsheet_last_updated"); -$database = $ini_array["connection"]["database"]; - -$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $persistence_backend = $ini_array["general"]["persistence_backend"]; if ($persistence_backend == "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => true)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { diff --git a/server/services/createNewGSheet.php b/server/services/createNewGSheet.php index 69bf5ff7a..289c2a90e 100644 --- a/server/services/createNewGSheet.php +++ b/server/services/createNewGSheet.php @@ -3,8 +3,14 @@ header('Content-type: application/json'); require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; use headstart\library; +$INI_DIR = dirname(__FILE__) . "/../preprocessing/conf/"; +$ini_array = library\Toolkit::loadIni($INI_DIR); +$apiclient = new \headstart\library\APIClient($ini_array); + if(php_sapi_name() == 'cli') { // Called from command-line, maybe cron # parse options @@ -24,12 +30,11 @@ echo "Call not accepted."; } -$route = $ini_array["general"]["api_url"] . "/gsheets" . "/createKnowledgebase"; $payload = json_encode(array("sheet_name" => $sheet_name, "project_name" => $project_name, "main_curator_email" => $main_curator_email, "knowledge_base_template_id" => $knowledge_base_template_id)); -$res = library\CommUtils::call_api($route, $payload); +$res = $apiclient->call_api("/gsheets" . "/createKnowledgebase", $payload); if ($res["httpcode"] != 200) { echo json_encode($res); } else { diff --git a/server/services/getContext.php b/server/services/getContext.php index f9bec52a8..672c26042 100644 --- a/server/services/getContext.php +++ b/server/services/getContext.php @@ -5,6 +5,7 @@ require dirname(__FILE__) . '/../classes/headstart/persistence/SQLitePersistence.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; use headstart\library; @@ -15,14 +16,13 @@ $vis_id = library\CommUtils::getParameter($_GET, "vis_id"); $revision_context = isset($_GET["revision_context"]) ? library\CommUtils::getParameter($_GET, "revision_context") : false; +$apiclient = new headstart\library\APIClient($ini_array); $persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); -$database = $ini_array["connection"]["database"]; $persistence_backend = $ini_array["general"]["persistence_backend"]; if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "getContext/" . 
$database; $payload = json_encode(array("vis_id" => $vis_id, "revision_context" => $revision_context)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getContext", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { diff --git a/server/services/getGSheetsMap.php b/server/services/getGSheetsMap.php index 9061f00a5..777ee5495 100644 --- a/server/services/getGSheetsMap.php +++ b/server/services/getGSheetsMap.php @@ -4,6 +4,7 @@ require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; require 'search.php'; use headstart\library; @@ -11,17 +12,15 @@ $INI_DIR = dirname(__FILE__) . "/../preprocessing/conf/"; $ini_array = library\Toolkit::loadIni($INI_DIR); +$apiclient = new \headstart\library\APIClient($ini_array); +$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $vis_id = library\CommUtils::getParameter($_GET, "vis_id"); - -$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $persistence_backend = $ini_array["general"]["persistence_backend"]; -$database = $ini_array["connection"]["database"]; if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => true)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { diff --git a/server/services/getLastVersion.php b/server/services/getLastVersion.php index e2744bf61..ce96726d7 100644 --- a/server/services/getLastVersion.php +++ b/server/services/getLastVersion.php @@ -6,25 +6,24 @@ require dirname(__FILE__) . '/../classes/headstart/persistence/SQLitePersistence.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; use headstart\library; $INI_DIR = dirname(__FILE__) . "/../preprocessing/conf/"; $ini_array = library\Toolkit::loadIni($INI_DIR); +$apiclient = new \headstart\library\APIClient($ini_array); +$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $vis_id = library\CommUtils::getParameter($_GET, "vis_id"); -$database = $ini_array["connection"]["database"]; - -$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); $persistence_backend = $ini_array["general"]["persistence_backend"]; if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . 
$database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => false)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { echo json_encode($res); } else { diff --git a/server/services/getLatestRevision.php b/server/services/getLatestRevision.php index d5ca3bd5e..320c456c8 100644 --- a/server/services/getLatestRevision.php +++ b/server/services/getLatestRevision.php @@ -6,23 +6,23 @@ require dirname(__FILE__) . '/../classes/headstart/persistence/SQLitePersistence.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/CommUtils.php'; require_once dirname(__FILE__) . '/../classes/headstart/library/toolkit.php'; +require_once dirname(__FILE__) . '/../classes/headstart/library/APIClient.php'; use headstart\library; $INI_DIR = dirname(__FILE__) . "/../preprocessing/conf/"; - $ini_array = library\Toolkit::loadIni($INI_DIR); +$apiclient = new \headstart\library\APIClient($ini_array); +$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); + +$persistence_backend = $ini_array["general"]["persistence_backend"]; +$processing_backend = $ini_array["general"]["processing_backend"]; $vis_id = library\CommUtils::getParameter($_GET, "vis_id"); $context = filter_input(INPUT_GET, "context", FILTER_VALIDATE_BOOLEAN, array("flags" => FILTER_NULL_ON_FAILURE)); $streamgraph = filter_input(INPUT_GET, "streamgraph", FILTER_VALIDATE_BOOLEAN, array("flags" => FILTER_NULL_ON_FAILURE)); -$database = $ini_array["connection"]["database"]; - -$persistence = new headstart\persistence\SQLitePersistence($ini_array["connection"]["sqlite_db"]); -$persistence_backend = $ini_array["general"]["persistence_backend"]; -$processing_backend = $ini_array["general"]["processing_backend"]; if ($processing_backend == "api") { # case of streamgraph calculation in backend @@ -30,9 +30,8 @@ # context data true start if ($persistence_backend === "api") { # get data + context from api - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => true)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { @@ -62,9 +61,8 @@ } else { if ($persistence_backend === "api") { # return data without context from api - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => false)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { @@ -84,9 +82,8 @@ if ($context === true) { if ($persistence_backend === "api") { # get data + context from api - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . 
$database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => true)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { @@ -117,9 +114,8 @@ } else { if ($persistence_backend === "api") { # get data without context from api - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $vis_id, "details" => false, "context" => false)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { library\CommUtils::echoOrCallback($res, $_GET); } else { diff --git a/server/services/search.php b/server/services/search.php index f2b522366..adf34b333 100644 --- a/server/services/search.php +++ b/server/services/search.php @@ -1,12 +1,11 @@ "PLOS" , "pubmed" => "PubMed" , "doaj" => "DOAJ" @@ -74,13 +78,6 @@ function search($service_integration, $dirty_query $service2endpoint = array("triple_km" => "triple", "triple_sg" => "triple"); - $processing_backend = isset($ini_array["general"]["processing_backend"]) - ? ($ini_array["general"]["processing_backend"]) - : "legacy"; - $persistence_backend = isset($ini_array["general"]["persistence_backend"]) - ? ($ini_array["general"]["persistence_backend"]) - : "legacy"; - $query = ($do_clean_query === true) ?(cleanQuery($dirty_query, $transform_query_tolowercase)) :($dirty_query); @@ -96,10 +93,9 @@ function search($service_integration, $dirty_query $params_for_id_creation = ($params_for_id === null)?($params_json):(packParamsJSON($params_for_id, $post_params)); if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "createID"; $payload = json_encode(array("params" => $post_params, "param_types" => $param_types)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("createID", $payload); if ($res["httpcode"] != 200) { echo json_encode($res); } else { @@ -115,11 +111,10 @@ function search($service_integration, $dirty_query if($retrieve_cached_map) { if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "getLastVersion/" . $database; $payload = json_encode(array("vis_id" => $unique_id, "details" => false, "context" => false)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("getLastVersion", $payload); if ($res["httpcode"] != 200) { echo json_encode($res); } else { @@ -134,23 +129,20 @@ function search($service_integration, $dirty_query } } - $params_file = tmpfile(); - $params_meta = stream_get_meta_data($params_file); - $params_filename = $params_meta["uri"]; - fwrite($params_file, $params_json); - - $WORKING_DIR = $ini_array["general"]["preprocessing_dir"] . $ini_array["output"]["output_dir"]; - if ($processing_backend === "api") { - $route = $ini_array["general"]["api_url"] . $endpoint . "/search"; $payload = json_encode($post_params); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_api($endpoint . 
"/search", $payload); if ($res["httpcode"] != 200) { - return $res; + return json_encode($res); } else { $output_json = $res["result"]; } } else { + $params_file = tmpfile(); + $params_meta = stream_get_meta_data($params_file); + $params_filename = $params_meta["uri"]; + fwrite($params_file, $params_json); + $WORKING_DIR = $ini_array["general"]["preprocessing_dir"] . $ini_array["output"]["output_dir"]; $calculation = new \headstart\preprocessing\calculation\RCalculation($ini_array); $output = $calculation->performCalculationAndReturnOutputAsJSON($WORKING_DIR, $query, $params_filename, $endpoint); @@ -179,11 +171,10 @@ function search($service_integration, $dirty_query $vis_title = $service_integration; if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "existsVisualization/" . $database; $payload = json_encode(array("vis_id" => $unique_id)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("existsVisualization", $payload); if ($res["httpcode"] != 200) { - return $res; + return json_encode($res); } else { $result = json_decode($res["result"], true); $exists = $result["exists"]; @@ -194,28 +185,26 @@ function search($service_integration, $dirty_query if (!$exists) { if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "createVisualization/" . $database; $payload = json_encode(array("vis_id" => $unique_id, "vis_title" => $vis_title, "data" => $input_json, "vis_clean_query" => $query, "vis_query" => $dirty_query, "vis_params" => $params_json)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("createVisualization", $payload); if ($res["httpcode"] != 200) { - return $res; + return json_encode($res); } } else { $persistence->createVisualization($unique_id, $vis_title, $input_json, $query, $dirty_query, $params_json); } } else { if ($persistence_backend === "api") { - $route = $ini_array["general"]["api_url"] . "persistence/" . "writeRevision/" . 
$database; $payload = json_encode(array("vis_id" => $unique_id, "data" => $input_json)); - $res = library\CommUtils::call_api($route, $payload); + $res = $apiclient->call_persistence("writeRevision", $payload); if ($res["httpcode"] != 200) { - return $res; + return json_encode($res); } } else { $persistence->writeRevision($unique_id, $input_json); diff --git a/server/services/searchBASE.php b/server/services/searchBASE.php index 28b92a489..9c73362a4 100644 --- a/server/services/searchBASE.php +++ b/server/services/searchBASE.php @@ -27,11 +27,10 @@ $result = search("base", $dirty_query , $post_params, $params_array - , ";", null, true + , true , true, null, 3 , "area_uri", "subject" - , $precomputed_id, false - , "legacy", "legacy"); + , $precomputed_id, false); echo $result diff --git a/server/services/searchDOAJ.php b/server/services/searchDOAJ.php index 47e4cc7b0..e2000ac04 100644 --- a/server/services/searchDOAJ.php +++ b/server/services/searchDOAJ.php @@ -14,11 +14,10 @@ $result = search("doaj", $dirty_query , $post_params, array("from", "to", "today", "sorting") - , ";", null, true + , true , true, null, 3 , "area_uri", "subject" - , $precomputed_id, false - , "legacy", "legacy"); + , $precomputed_id, false); echo $result diff --git a/server/services/searchLinkedCat.php b/server/services/searchLinkedCat.php index 7595a92de..a0aa59126 100644 --- a/server/services/searchLinkedCat.php +++ b/server/services/searchLinkedCat.php @@ -15,10 +15,9 @@ $result = search("linkedcat", $dirty_query, $post_params, array("from", "to", "include_content_type", "today", "vis_type"), - ";", null, $transform_query_tolowercase=false, true, null, 3, - "area_uri", "subject", $precomputed_id, true, - "legacy"); + "area_uri", "subject", + $precomputed_id, true); echo $result diff --git a/server/services/searchLinkedCatAuthorview.php b/server/services/searchLinkedCatAuthorview.php index a61e24518..0c7ea8e83 100644 --- a/server/services/searchLinkedCatAuthorview.php +++ b/server/services/searchLinkedCatAuthorview.php @@ -16,11 +16,8 @@ $dirty_query, $post_params, array("today", "author_id", "doc_count", "living_dates", "image_link", "vis_type"), - ";", - null, $transform_query_tolowercase = false, true, null, 3, - "area_uri", "subject", $precomputed_id, true, - "legacy" + "area_uri", "subject", $precomputed_id, true ); echo $result diff --git a/server/services/searchLinkedCatBrowseview.php b/server/services/searchLinkedCatBrowseview.php index 41c5e9221..1b47d3971 100644 --- a/server/services/searchLinkedCatBrowseview.php +++ b/server/services/searchLinkedCatBrowseview.php @@ -16,11 +16,9 @@ $dirty_query, $post_params, array("today", "bkl_level", "bkl_list", "doc_count", "bkl_top_caption", "from", "to", "include_content_type"), - ";", - null, $transform_query_tolowercase = false, true, null, 3, - "area_uri", "subject", $precomputed_id, true, - "legacy" + "area_uri", "subject", + $precomputed_id, true ); echo $result diff --git a/server/services/searchOpenAire.php b/server/services/searchOpenAire.php index eb9fce663..4a8953a68 100644 --- a/server/services/searchOpenAire.php +++ b/server/services/searchOpenAire.php @@ -25,11 +25,10 @@ "openaire_link", "obj_id", "acronym") - , ";", null, false + , false , true, array("project_id", "funder"), 3 , "area_uri", "subject" - , null, true, - "legacy", "legacy"); + , null, true); echo $result diff --git a/server/services/searchPLOS.php b/server/services/searchPLOS.php index 70bcccb25..8ef5c0c37 100644 --- a/server/services/searchPLOS.php +++ b/server/services/searchPLOS.php 
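For orientation, the persistence calls refactored above all follow the same contract: a POST with a JSON payload to a route that carries the target database in the path. A minimal Python sketch of that contract, assuming the URL layout implied by the removed `$route` concatenations and the new `api_flavor` setting (all concrete values here are illustrative, not part of this changeset):

```python
# Hedged sketch of the persistence API contract used by the PHP services.
# The URL layout and all values are assumptions inferred from this diff.
import requests

API_URL = "http://127.0.0.1/api/"  # api_url from config_local.ini
FLAVOR = "stable"                  # api_flavor from config_local.ini
DATABASE = "dev"                   # database name, as configured in settings.py

def call_persistence(route, payload):
    url = f"{API_URL}{FLAVOR}/persistence/{route}/{DATABASE}"
    res = requests.post(url, json=payload, timeout=300)
    return {"httpcode": res.status_code, "result": res.text}

# The same payload the services above send for getLastVersion:
res = call_persistence("getLastVersion",
                       {"vis_id": "some_vis_id", "details": False, "context": False})
```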
@@ -14,9 +14,9 @@ $result = search("plos", $dirty_query, $post_params
, array("article_types", "journals", "from", "to", "sorting")
- , ";", "/", true, true, null, 3
- , "area_uri", "subject", $precomputed_id, false
- , "legacy");
+ , true, true, null, 3
+ , "area_uri", "subject"
+ , $precomputed_id, false);
echo $result
diff --git a/server/services/searchPubmed.php b/server/services/searchPubmed.php index c3f535862..405310700 100644
--- a/server/services/searchPubmed.php
+++ b/server/services/searchPubmed.php
@@ -18,11 +18,10 @@ $result = search("pubmed", $dirty_query
, $post_params, $query_params
- , ";", null, true
+ , true
, true, null, 3
, "area_uri", "subject"
- , $precomputed_id, false
- , "legacy", "legacy");
+ , $precomputed_id, false);
echo $result
diff --git a/server/services/searchTRIPLE.php b/server/services/searchTRIPLE.php index 345ba2018..48a12b342 100644
--- a/server/services/searchTRIPLE.php
+++ b/server/services/searchTRIPLE.php
@@ -27,9 +27,8 @@ $result = search($service_integration, $dirty_query
, $post_params, $param_types
- , ";", null, true
- , true, null, 3
- , "area_uri", "subject"
+ , true
+ , true, null
, $precomputed_id, true);
echo $result
diff --git a/server/workers/README.md b/server/workers/README.md index 0d9e4159a..760a706cf 100644
--- a/server/workers/README.md
+++ b/server/workers/README.md
@@ -14,21 +14,11 @@ Each comes with a docker file (ending on `.docker`), which is used for creating
Please follow the install instructions for your OS:
-* Windows: https://docs.docker.com/docker-for-windows/install/
* Mac: https://docs.docker.com/docker-for-mac/install/
* Ubuntu: https://docs.docker.com/engine/install/ubuntu/ (also available for other Linux distributions)
Please follow the install instructions for docker-compose for your OS: https://docs.docker.com/compose/install/
-### Windows
-
-It is recommended to install the latest version of [Docker for Windows](https://hub.docker.com/editions/community/docker-ce-desktop-windows).
-Additionally, following settings may need to be activated:
-
-* [Volume Sharing](https://docs.microsoft.com/en-us/visualstudio/containers/troubleshooting-docker-errors?view=vs-2019)
-
-(In case Docker for Windows does not seem to start, it may be already running in the background and hiding in the task bar menu in the lower right corner.)
-
### Setting up the Apache2 reverse proxy
The following Apache2 mods have to be installed and enabled:
@@ -49,11 +39,12 @@ The following lines have to be added to the appropriate sites-available config o
# other config
# Proxy server settings for Head Start API
- ProxyPass /api http://localhost:5001/api connectiontimeout=120 timeout=120
- ProxyPassReverse /api http://localhost:5001/api
- ProxyPass /swaggerui http://localhost:5001/swaggerui
- ProxyPassReverse /swaggerui http://localhost:5001/swaggerui
-
+ <Location /api>
+ Deny from all
+ Allow from 127.0.0.1
+ ProxyPass http://127.0.0.1:8080/
+ ProxyPassReverse http://127.0.0.1/api
+ </Location>
```
@@ -71,10 +62,8 @@ Services:
* In `server/workers/services/src/config` copy `example_settings.py` to `settings.py` and change the values for `ENV` (`development` or `production`) and `DEBUG` (`TRUE` or `FALSE`).
* In `settings.py` you can also configure databases.
- TRIPLE ElasticSearch core service:
-* In `server/workers/services/triple/` copy `example_es_config.json` to `es_config.json` and fill in the fields.
-* In `server/workers/services/triple/` copy `example_triple.env` to `triple.env` and change the values if necessary.
+* In `server/workers/services/triple/` copy `example_triple.env` to `triple.env` and fill in the ElasticSearch access values.

GSheets Google API client authentication credentials: * In `server/workers/services/gsheets/` copy `example_gsheets.env` to `gsheets.env` and change the values if necessary.
@@ -82,79 +71,84 @@ GSheets Google API client authentication credentials:
Secure Redis:
-* In `server/workers` copy `example_redis_config.json` to `redis_config.json` and `example_redis.conf` to `redis.conf` and in both files replace "long_secure_password" with a long, secure password (Line 507 in redis.conf, parameter `requirepass`).
+* In `server/workers` copy `example_redis.conf` to `redis.conf` and replace "long_secure_password" with a long, secure password (line 507 in redis.conf, parameter `requirepass`).
+
+Secure Postgres:
+* In `server/workers` duplicate `example_pg_hba.conf` to `pg_hba.conf` and review the settings. The defaults should be fine for a standard deployment (host connections are only allowed for user "headstart" with an md5-hashed password), but you may want to change the access rights.
+Overall deployment environment variables:
PostgreSQL service:
-* In root folder create `.env` from the `example.env` and fill in the environment variables with the correct login data.
-* Manual database creation:
+* In the `server/workers/flavorconfigs` folder create a new `{flavor}.env` from `example.env` and fill in the environment variables with the correct login data.
+ * This includes the PostgreSQL and Redis settings
+
-Enter container: `docker exec -it VARYINGNAME_pgsql_1 psql -U headstart`
+* Manual database creation for Postgres:
+
+Enter container: `docker exec -it VARYINGNAME_db_1 psql -U headstart`
Execute command: `CREATE DATABASE databasename;`
+Exit the container and re-enter it as a normal user: `docker exec -it VARYINGNAME_db_1 /bin/bash`
+
+Execute command: `python manage.py`
+
* In `preprocessing/conf/config_local.ini` change "databasename" to the dev/production database name for the specific integration. This should be in line with the database names provided in `settings.py`.
-Secure Postgres:
-* In `server/workers` duplicate `example_pg_hba.conf` to `pg_hba.conf` and review the settings. The default values should be ok for a default deployment (host connections are only allowed for user "headstart" with an md5-hashed password), but you may want to change access rights.
-### Starting the backend services with docker-compose
+### Adding a new versioned "flavor" of the backend
-Following commands have to be executed from the root folder of the repository, where `docker-compose.yml` is located.
-**Build images**
+1. Make changes to the code in `server/workers` (any API/integration, …)
+1. Commit the changes
+1. Check out the commit (make a note of the commit hash)
+1. Run `server/workers/build_docker_images.sh`
+1. Create a new `{flavor}.env` in `server/workers/flavorconfigs/` using `example.env` as a template. Set `COMPOSE_PROJECT_NAME={flavor}` and `SERVICE_VERSION={commit hash}` (the hash noted in step 3).
+1. Run `docker-compose --env-file server/workers/flavorconfigs/{flavor}.env up -d` to start the services (note that `--env-file` must come before the `up` subcommand)
+1. Add a new entry to `server/workers/proxy/templates/default.conf.template`
+1. Add the flavored networks to `server/workers/proxy/docker-compose.yml` so that the Nginx proxy knows where to find the specific versioned services
+1. Down and up the proxy service from the `server/workers/proxy` working directory
+1. Test with e.g. `curl -vvvv localhost/api/{flavor}/triple/service_version`; a scripted check is sketched below
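A scripted variant of that last check, assuming the Apache-to-Nginx proxy chain set up above and the service list from `build_docker_images.sh` (host, flavor, and timeout are illustrative):

```python
# Smoke test for a deployed flavor: each service should report the commit
# hash that was set as SERVICE_VERSION in its flavor .env file.
import requests

FLAVOR = "stable"
SERVICES = ["triple", "gsheets", "base", "pubmed", "openaire", "persistence"]

for service in SERVICES:
    url = f"http://localhost/api/{FLAVOR}/{service}/service_version"
    try:
        r = requests.get(url, timeout=10)
        print(service, r.status_code, r.json().get("service_version"))
    except requests.RequestException as e:
        print(service, "unreachable:", e)
```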
### Starting a specific versioned "flavor" of the backend services with docker-compose
+
+The following commands have to be executed from the root folder of the repository, where `docker-compose.yml` is located.
**Start the services and send them to the Docker daemon**
-* on Linux:
```
-docker-compose up -d
+docker-compose --env-file server/workers/flavorconfigs/{flavor}.env up -d
```
-* on Windows:
-```
-docker-compose -f docker-compose_win.yml up -d
-```
-**All in one:**
+**Shutting the services down**
-* on Linux:
```
-docker-compose up -d --build
+docker-compose --env-file server/workers/flavorconfigs/{flavor}.env down
```
-* shut service down
-* on Linux:
-```
-docker-compose down
-```
-
-* on Windows:
-```
-docker-compose -f docker-compose_win.yml down
-```
+### Adding a new service to the backend
-### Deploying the example:
+1. Add the service configuration in `docker-compose.yml`
+ 1. Add the required environment variables that need to be passed from `.env` to the container in `docker-compose.yml`
+1. Add service-related changes in `build_docker_images.sh`
+ 1. Add the service to the build list
+1. Add the service source code and Dockerfile in a new folder in `server/workers`
+1. Add the new env variables to the `.env` files
-Use a deployment script, or manually deploy an example (currently only TRIPLE is integrated in this way) as described in [HOWTO: search repos](../../doc/howto_search_repos.md):
-Additionally, the `config_local.ini` now requires an additional parameter under `[general]`:
+### Integrating with clients
+In `server/preprocessing/conf/config_local.ini` change the following settings:
```
# URL to OKMaps API
-api_url = ""
-
-```
-
-where `api_url` is the full URL to the API endpoint.
+api_url = "http://127.0.0.1/api/"
+# flavor of API, default: "stable"
+api_flavor = "stable"
+# The persistence backend to use - either api or legacy
+persistence_backend = "api"
+# The processing backend to use - either api or legacy
+processing_backend = "api"
+``` \ No newline at end of file
diff --git a/server/services.docker b/server/workers/api/Dockerfile
similarity index 61% rename from server/services.docker rename to server/workers/api/Dockerfile index 07e014cdb..9e5271193 100644
--- a/server/services.docker
+++ b/server/workers/api/Dockerfile
@@ -6,11 +6,10 @@ RUN apt-get update
RUN apt-get install -y --no-install-recommends gcc
RUN apt-get install -y --no-install-recommends git
-WORKDIR /headstart
-COPY workers/services/requirements.txt .
+WORKDIR /api
+COPY workers/api/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install git+https://github.com/python-restx/flask-restx
-COPY workers/services/src/ ./
-COPY workers/redis_config.json . 
+COPY workers/api/src/ ./ + -CMD gunicorn --workers 10 --threads 2 -b 127.0.0.1:5001 'app:app' --timeout 300 diff --git a/server/workers/services/requirements.txt b/server/workers/api/requirements.txt similarity index 100% rename from server/workers/services/requirements.txt rename to server/workers/api/requirements.txt diff --git a/server/workers/services/src/__init__.py b/server/workers/api/src/__init__.py similarity index 100% rename from server/workers/services/src/__init__.py rename to server/workers/api/src/__init__.py diff --git a/server/workers/services/src/apis/__init__.py b/server/workers/api/src/apis/__init__.py similarity index 100% rename from server/workers/services/src/apis/__init__.py rename to server/workers/api/src/apis/__init__.py diff --git a/server/workers/services/src/apis/base.py b/server/workers/api/src/apis/base.py similarity index 91% rename from server/workers/services/src/apis/base.py rename to server/workers/api/src/apis/base.py index e4e4e9bc5..3cbad3426 100644 --- a/server/workers/services/src/apis/base.py +++ b/server/workers/api/src/apis/base.py @@ -13,13 +13,15 @@ from apis.utils import get_key -with open("redis_config.json") as infile: - redis_config = json.load(infile) - -redis_store = redis.StrictRedis(**redis_config) - base_ns = Namespace("base", description="BASE API operations") +redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") +} +redis_store = redis.StrictRedis(**redis_config) search_param_schema = SearchParamSchema() @@ -94,3 +96,11 @@ def post(self): except Exception as e: base_ns.logger.error(e) abort(500, "Problem encountered, check logs.") + + + +@base_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/services/src/apis/gsheets.py b/server/workers/api/src/apis/gsheets.py similarity index 91% rename from server/workers/services/src/apis/gsheets.py rename to server/workers/api/src/apis/gsheets.py index 360dd5bfb..bcc9baa5c 100644 --- a/server/workers/services/src/apis/gsheets.py +++ b/server/workers/api/src/apis/gsheets.py @@ -8,10 +8,12 @@ from flask_restx import Namespace, Resource, fields from apis.utils import get_key - -with open("redis_config.json") as infile: - redis_config = json.load(infile) - +redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") +} redis_store = redis.StrictRedis(**redis_config) gsheets_ns = Namespace("google_sheets", description="Google Sheets API operations") @@ -100,3 +102,9 @@ def post(self): except Exception as e: gsheets_ns.logger.error(e) abort(500, "Problem encountered during processing, sorry.") + +@gsheets_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/services/src/apis/openaire.py b/server/workers/api/src/apis/openaire.py similarity index 90% rename from server/workers/services/src/apis/openaire.py rename to server/workers/api/src/apis/openaire.py index 13eaea1aa..4ed1e9a48 100644 --- a/server/workers/services/src/apis/openaire.py +++ 
b/server/workers/api/src/apis/openaire.py @@ -13,9 +13,12 @@ from apis.utils import get_key -with open("redis_config.json") as infile: - redis_config = json.load(infile) - +redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") +} redis_store = redis.StrictRedis(**redis_config) openaire_ns = Namespace("openaire", description="OpenAIRE API operations") @@ -92,3 +95,9 @@ def post(self): except Exception as e: openaire_ns.logger.error(e) abort(500, "Problem encountered, check logs.") + +@openaire_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/services/src/apis/pubmed.py b/server/workers/api/src/apis/pubmed.py similarity index 91% rename from server/workers/services/src/apis/pubmed.py rename to server/workers/api/src/apis/pubmed.py index fa04bc870..06dd0b1c5 100644 --- a/server/workers/services/src/apis/pubmed.py +++ b/server/workers/api/src/apis/pubmed.py @@ -13,9 +13,12 @@ from apis.utils import get_key -with open("redis_config.json") as infile: - redis_config = json.load(infile) - +redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") +} redis_store = redis.StrictRedis(**redis_config) pubmed_ns = Namespace("pubmed", description="PubMed API operations") @@ -94,3 +97,9 @@ def post(self): except Exception as e: pubmed_ns.logger.error(e) abort(500, "Problem encountered, check logs.") + +@pubmed_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/services/src/apis/request_validators.py b/server/workers/api/src/apis/request_validators.py similarity index 100% rename from server/workers/services/src/apis/request_validators.py rename to server/workers/api/src/apis/request_validators.py diff --git a/server/workers/services/src/apis/triple.py b/server/workers/api/src/apis/triple.py similarity index 84% rename from server/workers/services/src/apis/triple.py rename to server/workers/api/src/apis/triple.py index c31bd59d8..d1a6c9a2e 100644 --- a/server/workers/services/src/apis/triple.py +++ b/server/workers/api/src/apis/triple.py @@ -13,9 +13,12 @@ from apis.utils import get_key, detect_error -with open("redis_config.json") as infile: - redis_config = json.load(infile) - +redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") +} redis_store = redis.StrictRedis(**redis_config) triple_ns = Namespace("triple", description="TRIPLE API operations") @@ -58,6 +61,10 @@ class Search(Resource): def post(self): """ """ + if redis_store.llen("input_data") > 10: + result = {"status": "error", + "reason": "dataprocessing rate limit"} + return jsonify(result) params = request.get_json() triple_ns.logger.debug(params) errors = search_param_schema.validate(params, partial=True) @@ -69,6 +76,11 @@ def post(self): d = {"id": k, "params": params, "endpoint": "search"} triple_ns.logger.debug(d) + # if length of queue > ?? 
+ # make_response with "wait later" and 503+headers + # have this handled by lightweight-client in search.php + # which then is handled by search-flow under new processing-timeout error + # add to logging redis_store.rpush("triple", json.dumps(d)) result = get_key(redis_store, k) headers = {} @@ -124,3 +136,16 @@ def get(self): return make_response(result, 200, headers) + + +@triple_ns.route('/service_version') +class ServiceVersion(Resource): + def get(self): + result = {"service_version": os.getenv("SERVICE_VERSION")} + return make_response(result, 200, {"Content-Type": "application/json"}) + +@triple_ns.route('/healthcheck') +class Healthcheck(Resource): + def get(self): + result = {"status": "I'm good"} + return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file diff --git a/server/workers/services/src/apis/utils.py b/server/workers/api/src/apis/utils.py similarity index 100% rename from server/workers/services/src/apis/utils.py rename to server/workers/api/src/apis/utils.py diff --git a/server/workers/services/src/app.py b/server/workers/api/src/app.py similarity index 84% rename from server/workers/services/src/app.py rename to server/workers/api/src/app.py index 8127daaff..300c8f430 100644 --- a/server/workers/services/src/app.py +++ b/server/workers/api/src/app.py @@ -1,6 +1,6 @@ import os import sys -from flask import Flask +from flask import Flask, redirect, url_for from flask_restx import Api from flask_cors import CORS from werkzeug.middleware.proxy_fix import ProxyFix @@ -10,10 +10,9 @@ from apis.base import base_ns from apis.pubmed import pubmed_ns from apis.openaire import openaire_ns -from apis.persistence import persistence_ns -from config import settings -from utils.monkeypatches import ReverseProxied, __schema__, specs_url, _register_apidoc, inject_flasgger +import settings +from utils.monkeypatches import ReverseProxied, __schema__, specs_url, _register_apidoc import logging @@ -28,17 +27,16 @@ def api_patches(app, settings): description="Head Start API demo", version="0.1", prefix='/api', - doc="/api/docs") + doc="/docs") if settings.BEHIND_PROXY: api_fixed.behind_proxy = True return api_fixed app = Flask('v1', instance_relative_config=True) -app.config.from_object('config.settings') +app.config.from_object('settings') handler = logging.StreamHandler(sys.stdout) handler.setLevel(app.logger.level) -app = inject_flasgger(app) app.wsgi_app = ProxyFix(app.wsgi_app, x_proto=1, x_port=1, x_for=1, x_host=1, x_prefix=1) app.wsgi_app = ReverseProxied(app.wsgi_app) CORS(app, expose_headers=["Content-Disposition", "Access-Control-Allow-Origin"]) @@ -49,7 +47,7 @@ def api_patches(app, settings): api.add_namespace(base_ns, path='/base') api.add_namespace(pubmed_ns, path='/pubmed') api.add_namespace(openaire_ns, path='/openaire') -api.add_namespace(persistence_ns, path='/persistence') + app.logger.debug(app.config) app.logger.debug(app.url_map) diff --git a/server/workers/api/src/example_settings.py b/server/workers/api/src/example_settings.py new file mode 100644 index 000000000..423d54eef --- /dev/null +++ b/server/workers/api/src/example_settings.py @@ -0,0 +1,6 @@ +BEHIND_PROXY = True +SWAGGER_BASEPATH = "" +DEFAULT_DATABASE = "dev" +DATABASES = ["test"] +ENV = "development" +DEBUG = True diff --git a/server/workers/services/src/templates/tables.html b/server/workers/api/src/templates/tables.html similarity index 100% rename from server/workers/services/src/templates/tables.html rename to server/workers/api/src/templates/tables.html diff 
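The queue-length guard added to `Search.post` above returns a 200 response with an error payload; the TODO comments sketch a friendlier failure mode. One possible realization of that TODO (an assumption, not part of this changeset) would reply with HTTP 503 and a `Retry-After` header, which `search.php` could then map to a processing-timeout error:

```python
# Hedged sketch of the 503-based backpressure described in the TODO comments.
from flask import jsonify, make_response

MAX_QUEUE_LENGTH = 10  # the "??" threshold in the TODO is still undecided

def reject_if_overloaded(redis_store):
    """Return a 503 response if the dataprocessing queue is too long, else None."""
    if redis_store.llen("input_data") > MAX_QUEUE_LENGTH:
        result = {"status": "error", "reason": "dataprocessing rate limit"}
        response = make_response(jsonify(result), 503)
        response.headers["Retry-After"] = "60"  # seconds; illustrative value
        return response
    return None
```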
--git a/server/workers/services/src/config/__init__.py b/server/workers/api/src/utils/__init__.py similarity index 100% rename from server/workers/services/src/config/__init__.py rename to server/workers/api/src/utils/__init__.py diff --git a/server/workers/services/src/utils/monkeypatches.py b/server/workers/api/src/utils/monkeypatches.py similarity index 100% rename from server/workers/services/src/utils/monkeypatches.py rename to server/workers/api/src/utils/monkeypatches.py diff --git a/server/base.docker b/server/workers/base/Dockerfile similarity index 98% rename from server/base.docker rename to server/workers/base/Dockerfile index 02e03c0a8..e968afa22 100644 --- a/server/base.docker +++ b/server/workers/base/Dockerfile @@ -156,7 +156,5 @@ COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log -COPY workers/redis_config.json . - -COPY workers/*.py ./ +COPY workers/base/*.py ./ ENTRYPOINT python3 run_base.py diff --git a/server/workers/run_base.py b/server/workers/base/run_base.py similarity index 64% rename from server/workers/run_base.py rename to server/workers/base/run_base.py index 2dcd4c7bf..1c212c6e1 100644 --- a/server/workers/run_base.py +++ b/server/workers/base/run_base.py @@ -5,8 +5,12 @@ if __name__ == '__main__': - with open("redis_config.json") as infile: - redis_config = json.load(infile) + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") + } redis_store = redis.StrictRedis(**redis_config) wrapper = BaseClient("./other-scripts", "run_base.R", redis_store, diff --git a/server/workers/build_docker_images.sh b/server/workers/build_docker_images.sh new file mode 100755 index 000000000..b32b015b7 --- /dev/null +++ b/server/workers/build_docker_images.sh @@ -0,0 +1,7 @@ +#!/bin/bash +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +services=("api" "persistence" "triple" "gsheets" "dataprocessing" "base" "pubmed" "openaire") +for service in ${services[@]}; do + docker build -f "$SCRIPT_DIR/../workers/$service/Dockerfile" -t "$service:`git rev-parse HEAD`" "$SCRIPT_DIR/../" +done + diff --git a/server/dataprocessing.docker b/server/workers/dataprocessing/Dockerfile similarity index 98% rename from server/dataprocessing.docker rename to server/workers/dataprocessing/Dockerfile index 53db85737..c527eb800 100644 --- a/server/dataprocessing.docker +++ b/server/workers/dataprocessing/Dockerfile @@ -159,7 +159,5 @@ COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log -COPY workers/redis_config.json . 
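These Dockerfile hunks drop the shared `redis_config.json` in favour of the per-container environment variables used throughout this changeset. One detail the repeated `os.getenv` blocks leave implicit is that environment values always arrive as strings; a slightly more defensive variant of the pattern (the int casts, defaults, and startup ping are suggestions, not part of this diff):

```python
# Env-based Redis configuration as used by the workers, with explicit
# int casts for port/db and an early ping to fail fast on bad credentials.
import os
import redis

redis_config = {
    "host": os.getenv("REDIS_HOST", "localhost"),
    "port": int(os.getenv("REDIS_PORT", "6379")),
    "db": int(os.getenv("REDIS_DB", "0")),
    "password": os.getenv("REDIS_PASSWORD"),
}
redis_store = redis.StrictRedis(**redis_config)
redis_store.ping()  # raises redis.ConnectionError if misconfigured
```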
- -COPY workers/*.py ./ +COPY workers/dataprocessing/*.py ./ ENTRYPOINT python3 run_dataprocessing.py diff --git a/server/workers/run_dataprocessing.py b/server/workers/dataprocessing/run_dataprocessing.py similarity index 66% rename from server/workers/run_dataprocessing.py rename to server/workers/dataprocessing/run_dataprocessing.py index 3ac7eb2e9..7f299c873 100644 --- a/server/workers/run_dataprocessing.py +++ b/server/workers/dataprocessing/run_dataprocessing.py @@ -5,8 +5,12 @@ if __name__ == '__main__': - with open("redis_config.json") as infile: - redis_config = json.load(infile) + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") + } redis_store = redis.StrictRedis(**redis_config) dp = Dataprocessing("./other-scripts", "run_vis_layout.R", diff --git a/server/workers/dataprocessing/src/headstart.py b/server/workers/dataprocessing/src/headstart.py index 1568e460e..dbeddcc86 100644 --- a/server/workers/dataprocessing/src/headstart.py +++ b/server/workers/dataprocessing/src/headstart.py @@ -84,8 +84,8 @@ def run(self): res = self.create_map(params, input_data) self.redis_store.set(k+"_output", json.dumps(res)) except Exception as e: - self.logger.error(e) self.logger.error(params) + self.logger.error(e, exc_info=True) res = {} res["id"] = k res["params"] = params diff --git a/server/workers/example_redis_config.json b/server/workers/example_redis_config.json deleted file mode 100644 index a4ede8678..000000000 --- a/server/workers/example_redis_config.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "host":"127.0.0.1", - "port": 6379, - "db":0, - "password":"long_secure_password" -} diff --git a/server/workers/flavorconfigs/example.env b/server/workers/flavorconfigs/example.env new file mode 100644 index 000000000..1e27b993a --- /dev/null +++ b/server/workers/flavorconfigs/example.env @@ -0,0 +1,16 @@ +COMPOSE_PROJECT_NAME=stable +SERVICE_VERSION=9babf3e4dc8d143454f1a2f4dd40732b2c4f4f38 +NETWORK=dockerrefactoring +POSTGRES_DB=postgres +POSTGRES_USER=headstart +POSTGRES_PASSWORD=testpassword +POSTGRES_HOST=stable_db_1 +POSTGRES_PORT=5432 +POSTGRES_HOSTNAME=headstart_pgsql_1 +PGADMIN_DEFAULT_EMAIL=christopher.kittel@openknowledgemaps.org +PGADMIN_DEFAULT_PASSWORD=testpassword +API_PORT=5001 +REDIS_HOST=stable_redis_1 +REDIS_PORT=6379 +REDIS_DB=0 +REDIS_PASSWORD=redis_password \ No newline at end of file diff --git a/server/search_gsheets.docker b/server/workers/gsheets/Dockerfile similarity index 85% rename from server/search_gsheets.docker rename to server/workers/gsheets/Dockerfile index c80187617..5ad41de7f 100644 --- a/server/search_gsheets.docker +++ b/server/workers/gsheets/Dockerfile @@ -9,8 +9,7 @@ WORKDIR /headstart COPY workers/gsheets/requirements.txt . RUN pip install --no-cache-dir -r requirements.txt COPY workers/gsheets/src/ ./gsheets/src -COPY workers/run_gsheets.py . +COPY workers/gsheets/run_gsheets.py . COPY workers/gsheets/token.pickle ./gsheets -COPY workers/redis_config.json . 
ENTRYPOINT python run_gsheets.py diff --git a/server/workers/run_gsheets.py b/server/workers/gsheets/run_gsheets.py similarity index 57% rename from server/workers/run_gsheets.py rename to server/workers/gsheets/run_gsheets.py index e7077afb7..66f93c098 100644 --- a/server/workers/run_gsheets.py +++ b/server/workers/gsheets/run_gsheets.py @@ -5,9 +5,12 @@ if __name__ == '__main__': - with open("redis_config.json") as infile: - redis_config = json.load(infile) - + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") + } redis_store = redis.StrictRedis(**redis_config) gc = GSheetsClient(redis_store, os.environ.get("GSHEETS_LOGLEVEL", "INFO")) gc.run() diff --git a/server/openaire.docker b/server/workers/openaire/Dockerfile similarity index 98% rename from server/openaire.docker rename to server/workers/openaire/Dockerfile index 4a15cc2f5..ffac40e61 100644 --- a/server/openaire.docker +++ b/server/workers/openaire/Dockerfile @@ -156,7 +156,5 @@ COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log -COPY workers/redis_config.json . - -COPY workers/*.py ./ +COPY workers/openaire/*.py ./ ENTRYPOINT python3 run_openaire.py diff --git a/server/workers/run_openaire.py b/server/workers/openaire/run_openaire.py similarity index 65% rename from server/workers/run_openaire.py rename to server/workers/openaire/run_openaire.py index 2f17fe605..45f9059ef 100644 --- a/server/workers/run_openaire.py +++ b/server/workers/openaire/run_openaire.py @@ -5,9 +5,12 @@ if __name__ == '__main__': - with open("redis_config.json") as infile: - redis_config = json.load(infile) - + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") + } redis_store = redis.StrictRedis(**redis_config) wrapper = OpenAIREClient("./other-scripts", "run_openaire.R", redis_store, "english", diff --git a/server/workers/persistence/Dockerfile b/server/workers/persistence/Dockerfile new file mode 100644 index 000000000..158008010 --- /dev/null +++ b/server/workers/persistence/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.6.10-slim + +MAINTAINER Chris Kittel "christopher.kittel@openknowledgemaps.org" + +RUN apt-get update +RUN apt-get install -y --no-install-recommends gcc +RUN apt-get install -y --no-install-recommends git + +WORKDIR /persistence +COPY workers/persistence/requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install git+https://github.com/python-restx/flask-restx
+COPY workers/persistence/src/ ./ \ No newline at end of file
diff --git a/server/workers/persistence/requirements.txt b/server/workers/persistence/requirements.txt new file mode 100644 index 000000000..78e0d05ad
--- /dev/null
+++ b/server/workers/persistence/requirements.txt
@@ -0,0 +1,7 @@
+flask
+flask-cors
+flask_sqlalchemy
+flask_restx
+Werkzeug
+gunicorn
+psycopg2-binary
diff --git a/server/workers/services/src/apis/persistence.py b/server/workers/persistence/src/apis/persistence.py
similarity index 97% rename from server/workers/services/src/apis/persistence.py rename to server/workers/persistence/src/apis/persistence.py index 355af9bbf..639c3f28b 100644
--- a/server/workers/services/src/apis/persistence.py
+++ b/server/workers/persistence/src/apis/persistence.py
@@ -1,3 +1,4 @@
+import os
from hashlib import md5
from datetime import datetime
import json
@@ -7,7 +8,7 @@
from models import Revisions, Visualizations
from database import sessions
-from config import settings
+import settings
persistence_ns = Namespace("persistence", description="OKMaps persistence operations")
@@ -293,11 +294,11 @@ def post(self, database): headers)
-@persistence_ns.route('/createID')
+@persistence_ns.route('/createID/<database>')
class createID(Resource):
@persistence_ns.produces(["application/json"])
- def post(self):
+ def post(self, database):
try: persistence_ns.logger.debug("createID") payload = request.get_json()
@@ -316,3 +317,10 @@ def post(self):
result = {'success': False, 'reason': e} headers = {'ContentType': 'application/json'} return make_response(jsonify(result), 500, headers)
+
+
+@persistence_ns.route('/service_version')
+class ServiceVersion(Resource):
+ def get(self):
+ result = {"service_version": os.getenv("SERVICE_VERSION")}
+ return make_response(result, 200, {"Content-Type": "application/json"}) \ No newline at end of file
diff --git a/server/workers/persistence/src/app.py b/server/workers/persistence/src/app.py new file mode 100644 index 000000000..1968d1d10
--- /dev/null
+++ b/server/workers/persistence/src/app.py
@@ -0,0 +1,73 @@
+import os
+import sys
+from flask import Flask
+from flask_restx import Api
+from flask_cors import CORS
+from werkzeug.middleware.proxy_fix import ProxyFix
+
+from apis.persistence import persistence_ns
+import settings
+import logging
+
+class ReverseProxied(object):
+ '''Wrap the application in this middleware and configure the
+ front-end server to add these headers, to let you quietly bind
+ this to a URL other than / and to an HTTP scheme that is
+ different than what is used locally. 
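The persistence service resolves the target database from the URL (hence the `<database>` path parameter) and looks it up in the `sessions` registry built in `database.py`, shown further below. A minimal usage sketch; the handler name and the `Visualizations` query fields are assumptions for illustration, not code from this changeset:

```python
# Hypothetical handler body using the per-database sessionmaker registry.
from database import sessions
from models import Visualizations

def get_last_version(database, vis_id):
    Session = sessions[database]   # KeyError if the database is not configured
    session = Session()
    try:
        # Field names on Visualizations are assumed for illustration.
        return (session.query(Visualizations)
                       .filter_by(vis_id=vis_id)
                       .one_or_none())
    finally:
        session.close()
```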
+ + location /myprefix { + proxy_pass http://192.168.0.1:5001; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Scheme $scheme; + proxy_set_header X-Script-Name /myprefix; + } + + :param app: the WSGI application + ''' + def __init__(self, app): + self.app = app + + def __call__(self, environ, start_response): + script_name = environ.get('HTTP_X_SCRIPT_NAME', '') + if script_name: + environ['SCRIPT_NAME'] = script_name + path_info = environ['PATH_INFO'] + if path_info.startswith(script_name): + environ['PATH_INFO'] = path_info[len(script_name):] + + scheme = environ.get('HTTP_X_SCHEME', '') + if scheme: + environ['wsgi.url_scheme'] = scheme + return self.app(environ, start_response) + + +def api_patches(app, settings): + api_fixed = Api( + app, + title="Head Start API", + description="Head Start API demo", + version="0.1", + prefix='/api', + doc="/docs") + if settings.BEHIND_PROXY: + api_fixed.behind_proxy = True + return api_fixed + + +app = Flask('v1', instance_relative_config=True) +app.config.from_object('settings') +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(app.logger.level) +app.wsgi_app = ProxyFix(app.wsgi_app, x_proto=1, x_port=1, x_for=1, x_host=1, x_prefix=1) +app.wsgi_app = ReverseProxied(app.wsgi_app) +CORS(app, expose_headers=["Content-Disposition", "Access-Control-Allow-Origin"]) + +api = api_patches(app, settings) +api.add_namespace(persistence_ns, path='/persistence') +app.logger.debug(app.config) +app.logger.debug(app.url_map) + + +if __name__ == '__main__': + app.run(host="127.0.0.1", port=5001, debug=True) diff --git a/server/workers/persistence/src/database.py b/server/workers/persistence/src/database.py new file mode 100644 index 000000000..68681344a --- /dev/null +++ b/server/workers/persistence/src/database.py @@ -0,0 +1,30 @@ +import os +from sqlalchemy.orm import sessionmaker +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +import settings + + +bind_params = { + "user": os.getenv("POSTGRES_USER"), + "pw": os.getenv("POSTGRES_PASSWORD"), + "host": os.getenv("POSTGRES_HOST"), + "port": os.getenv("POSTGRES_PORT"), + "db": settings.DEFAULT_DATABASE +} + +sessions = {} +sessions[settings.DEFAULT_DATABASE] = sessionmaker(bind=create_engine('postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % bind_params, + max_overflow=15, + pool_pre_ping=True, + pool_recycle=3600, + pool_size=30)) +for database in settings.DATABASES: + bind_params["db"] = database + sessions[database] = sessionmaker(bind=create_engine('postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % bind_params, + max_overflow=15, + pool_pre_ping=True, + pool_recycle=3600, + pool_size=30 + )) +Base = declarative_base() \ No newline at end of file diff --git a/server/workers/persistence/src/example_settings.py b/server/workers/persistence/src/example_settings.py new file mode 100644 index 000000000..423d54eef --- /dev/null +++ b/server/workers/persistence/src/example_settings.py @@ -0,0 +1,6 @@ +BEHIND_PROXY = True +SWAGGER_BASEPATH = "" +DEFAULT_DATABASE = "dev" +DATABASES = ["test"] +ENV = "development" +DEBUG = True diff --git a/server/workers/services/src/models.py b/server/workers/persistence/src/models.py similarity index 100% rename from server/workers/services/src/models.py rename to server/workers/persistence/src/models.py diff --git a/server/workers/proxy/docker-compose.yml b/server/workers/proxy/docker-compose.yml new file mode 100644 index 000000000..41b99533a --- 
/dev/null +++ b/server/workers/proxy/docker-compose.yml @@ -0,0 +1,19 @@ +version: '3.7' + +services: + + proxy: + image: 'nginx' + volumes: + - ./templates:/etc/nginx/templates + environment: + - NGINX_PORT=80 + ports: + - '8080:80' + networks: + - stable_headstart + +networks: + stable_headstart: + external: true + name: stable_headstart \ No newline at end of file diff --git a/server/workers/proxy/templates/default.conf.template b/server/workers/proxy/templates/default.conf.template new file mode 100644 index 000000000..1e459c450 --- /dev/null +++ b/server/workers/proxy/templates/default.conf.template @@ -0,0 +1,11 @@ +server { + listen ${NGINX_PORT}; + + location /stable/ { + proxy_pass http://stable_api_1:5001/api/; + + location /stable/persistence/ { + proxy_pass http://stable_persistence_1:5001/api/persistence/; + } + } +} \ No newline at end of file diff --git a/server/pubmed.docker b/server/workers/pubmed/Dockerfile similarity index 98% rename from server/pubmed.docker rename to server/workers/pubmed/Dockerfile index ec3311706..1b3fca43e 100644 --- a/server/pubmed.docker +++ b/server/workers/pubmed/Dockerfile @@ -156,7 +156,5 @@ COPY preprocessing/resources ./resources COPY preprocessing/other-scripts ./other-scripts RUN mkdir -p /var/log/headstart && touch /var/log/headstart/headstart.log -COPY workers/redis_config.json . - -COPY workers/*.py ./ +COPY workers/pubmed/*.py ./ ENTRYPOINT python3 run_pubmed.py diff --git a/server/workers/run_pubmed.py b/server/workers/pubmed/run_pubmed.py similarity index 64% rename from server/workers/run_pubmed.py rename to server/workers/pubmed/run_pubmed.py index f2d6da2de..3662d7158 100644 --- a/server/workers/run_pubmed.py +++ b/server/workers/pubmed/run_pubmed.py @@ -5,9 +5,12 @@ if __name__ == '__main__': - with open("redis_config.json") as infile: - redis_config = json.load(infile) - + redis_config = { + "host": os.getenv("REDIS_HOST"), + "port": os.getenv("REDIS_PORT"), + "db": os.getenv("REDIS_DB"), + "password": os.getenv("REDIS_PASSWORD") + } redis_store = redis.StrictRedis(**redis_config) wrapper = PubMedClient("./other-scripts", "run_pubmed.R", redis_store, "english", diff --git a/server/workers/run_triple.py b/server/workers/run_triple.py deleted file mode 100644 index 9b8700615..000000000 --- a/server/workers/run_triple.py +++ /dev/null @@ -1,15 +0,0 @@ -import os -import json -import redis -from triple.src.search_triple import TripleClient - - -if __name__ == '__main__': - with open("es_config.json") as infile: - es_config = json.load(infile) - with open("redis_config.json") as infile: - redis_config = json.load(infile) - - redis_store = redis.StrictRedis(**redis_config) - tc = TripleClient(es_config, redis_store, os.environ.get("TRIPLE_LOGLEVEL", "INFO")) - tc.run() diff --git a/server/workers/services/src/config/example_settings.py b/server/workers/services/src/config/example_settings.py deleted file mode 100644 index ab4375309..000000000 --- a/server/workers/services/src/config/example_settings.py +++ /dev/null @@ -1,22 +0,0 @@ -BEHIND_PROXY = True -DEFAULT = { - 'user': 'user', - 'pw': 'pw', - 'db': 'dev', - 'host': '127.0.0.1', - 'port': '5432', -} -TEST = { - 'user': 'testuser', - 'pw': 'testpassword', - 'db': 'test', - 'host': '127.0.0.1', - 'port': '5432', -} -SQLALCHEMY_DATABASE_URI = 'postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % DEFAULT -SQLALCHEMY_BINDS = { - 'test': 'postgresql://%(user)s:%(pw)s@%(host)s:%(port)s/%(db)s' % TEST -} -SQLALCHEMY_TRACK_MODIFICATIONS = False -ENV = "development" -DEBUG = True diff 
diff --git a/server/workers/services/src/config/swagger.json b/server/workers/services/src/config/swagger.json
deleted file mode 100644
index 034adf72a..000000000
--- a/server/workers/services/src/config/swagger.json
+++ /dev/null
@@ -1 +0,0 @@
-{"swagger": "2.0", "basePath": "/api", "paths": {"/triple/mappings": {"get": {"responses": {"200": {"description": "OK"}, "400": {"description": "Invalid search parameters"}}, "operationId": "get_mappings", "parameters": [{"description": "Specify the ElasticSearch index to get the mapping of, currently either 'isidore-sources-triple' or 'isidore-documents-triple'", "name": "index", "type": "string", "in": "query"}], "tags": ["triple"]}}, "/triple/search": {"post": {"responses": {"200": {"description": "OK"}, "400": {"description": "Invalid search parameters"}}, "operationId": "post_search", "parameters": [{"name": "payload", "required": true, "in": "body", "schema": {"$ref": "#/definitions/SearchQuery"}}], "produces": ["application/json", "text/csv"], "tags": ["triple"]}}}, "info": {"title": "Head Start API", "version": "0.1", "description": "Head Start API demo"}, "produces": ["application/json"], "consumes": ["application/json"], "tags": [{"name": "triple", "description": "TRIPLE API operations"}], "definitions": {"SearchQuery": {"required": ["from", "q", "sorting", "to", "vis_type"], "properties": {"q": {"type": "string", "description": "query string", "example": "feminicide"}, "sorting": {"type": "string", "description": "most-relevant or most-recent", "example": "most-recent"}, "from": {"type": "string", "description": "yyyy-MM-dd", "example": "2019-01-01"}, "to": {"type": "string", "description": "yyyy-MM-dd", "example": "2019-12-31"}, "vis_type": {"type": "string", "description": "overview or timeline", "example": "overview"}, "raw": {"type": "boolean", "description": "raw results from ElasticSearch", "example": "false"}}, "type": "object"}}, "responses": {"ParseError": {"description": "When a mask can't be parsed"}, "MaskError": {"description": "When any error occurs on mask"}}}
diff --git a/server/workers/services/src/database.py b/server/workers/services/src/database.py
deleted file mode 100644
index 694f7ea2c..000000000
--- a/server/workers/services/src/database.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from sqlalchemy.orm import sessionmaker
-from sqlalchemy import create_engine
-from sqlalchemy.ext.declarative import declarative_base
-from config import settings
-
-
-sessions = {}
-sessions[settings.DEFAULT["db"]] = sessionmaker(bind=create_engine(settings.SQLALCHEMY_DATABASE_URI))
-for data_integration, database in settings.SQLALCHEMY_BINDS.items():
-    sessions[data_integration] = sessionmaker(bind=create_engine(database,
-                                                                 max_overflow=15,
-                                                                 pool_pre_ping=True,
-                                                                 pool_recycle=3600,
-                                                                 pool_size=30
-                                                                 ))
-Base = declarative_base()
\ No newline at end of file
diff --git a/server/workers/services/src/manage.py b/server/workers/services/src/manage.py
deleted file mode 100644
index f3de767c2..000000000
--- a/server/workers/services/src/manage.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from app import app
-from models import Visualizations, Revisions
-from database import Base, sessions
-
-
-if __name__ == '__main__':
-    with app.app_context():
-        for database, Session in sessions.items():
-            try:
-                session = Session()
-                engine = session.get_bind()
-                for name, table in Base.metadata.tables.items():
-                    if not engine.dialect.has_table(engine, name):
-                        table.create(engine)
-            except Exception as e:
-                print(database, e)
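The manage.py deleted above bootstrapped missing tables by looping over Base.metadata and probing engine.dialect.has_table() by hand. For reference, SQLAlchemy's create_all() provides the same create-if-missing guard in one call; this is a minimal sketch with a placeholder connection URL, not code from this repository:

```python
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base

# In the real services this Base is the one the models module registers
# Visualizations and Revisions against.
Base = declarative_base()

# Placeholder URL; the live services assemble it from the POSTGRES_* variables.
engine = create_engine("postgresql://user:pw@db:5432/dev")

# checkfirst=True (the default) emits CREATE TABLE only for tables that do
# not exist yet, i.e. the check the deleted loop performed per table.
Base.metadata.create_all(engine, checkfirst=True)
```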
diff --git a/server/workers/services/src/utils/__init__.py b/server/workers/services/src/utils/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/server/workers/tests/test_helpers.py b/server/workers/tests/test_helpers.py
index bba33d324..3da202c9b 100644
--- a/server/workers/tests/test_helpers.py
+++ b/server/workers/tests/test_helpers.py
@@ -10,8 +10,12 @@
 
 from .conftest import RANDOM
 from ..services.src.apis.utils import get_key
 
-with open("redis_config.json") as infile:
-    redis_config = json.load(infile)
+redis_config = {
+    "host": os.getenv("REDIS_HOST"),
+    "port": os.getenv("REDIS_PORT"),
+    "db": os.getenv("REDIS_DB"),
+    "password": os.getenv("REDIS_PASSWORD")
+}
 
 redis_store = redis.StrictRedis(**redis_config)
diff --git a/server/search_triple.docker b/server/workers/triple/Dockerfile
similarity index 78%
rename from server/search_triple.docker
rename to server/workers/triple/Dockerfile
index 50e49454e..df9cef1d3 100644
--- a/server/search_triple.docker
+++ b/server/workers/triple/Dockerfile
@@ -10,8 +10,6 @@ COPY workers/triple/requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 RUN python -m spacy download xx_ent_wiki_sm
 COPY workers/triple/src/ ./triple/src
-COPY workers/run_triple.py .
-COPY workers/triple/es_config.json .
-COPY workers/redis_config.json .
+COPY workers/triple/run_triple.py .
 
 ENTRYPOINT python run_triple.py
diff --git a/server/workers/triple/example_es_config.json b/server/workers/triple/example_es_config.json
deleted file mode 100644
index 81a0fc96b..000000000
--- a/server/workers/triple/example_es_config.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-    "user": "string",
-    "pass": "string",
-    "host": "string",
-    "port": 9200,
-    "indices": {"string":"string"}
-}
diff --git a/server/workers/triple/example_triple.env b/server/workers/triple/example_triple.env
index 8462b8340..e055198c4 100644
--- a/server/workers/triple/example_triple.env
+++ b/server/workers/triple/example_triple.env
@@ -1 +1,8 @@
 TRIPLE_LOGLEVEL=DEBUG
+TRIPLE_USER=username
+TRIPLE_PASS=password
+TRIPLE_HOST=host.name
+TRIPLE_PORT=9200
+TRIPLE_DOCUMENTS_INDEX=string
+TRIPLE_PROJECTS_INDEX=string
+TRIPLE_AUTHORS_INDEX=string
\ No newline at end of file
diff --git a/server/workers/triple/run_triple.py b/server/workers/triple/run_triple.py
new file mode 100644
index 000000000..227663fea
--- /dev/null
+++ b/server/workers/triple/run_triple.py
@@ -0,0 +1,23 @@
+import os
+import json
+import redis
+from triple.src.search_triple import TripleClient
+
+
+if __name__ == '__main__':
+    es_config = {
+        "user": os.getenv("TRIPLE_USER"),
+        "pass": os.getenv("TRIPLE_PASS"),
+        "host": os.getenv("TRIPLE_HOST"),
+        "port": os.getenv("TRIPLE_PORT")
+    }
+    redis_config = {
+        "host": os.getenv("REDIS_HOST"),
+        "port": os.getenv("REDIS_PORT"),
+        "db": os.getenv("REDIS_DB"),
+        "password": os.getenv("REDIS_PASSWORD")
+    }
+
+    redis_store = redis.StrictRedis(**redis_config)
+    tc = TripleClient(es_config, redis_store, os.environ.get("TRIPLE_LOGLEVEL", "INFO"))
+    tc.run()
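With es_config.json and redis_config.json gone, the new run_triple.py depends entirely on the TRIPLE_* and REDIS_* variables enumerated in example_triple.env, and os.getenv() silently returns None for anything unset. A fail-fast check at worker startup is one way to surface a missing variable before the first connection attempt; check_env() below is a hypothetical helper sketched for illustration, not part of this changeset:

```python
import os

# Hypothetical guard, mirroring the variables in example_triple.env and the
# REDIS_* settings consumed by run_triple.py; adjust the list as needed.
REQUIRED_VARS = [
    "TRIPLE_USER", "TRIPLE_PASS", "TRIPLE_HOST", "TRIPLE_PORT",
    "TRIPLE_DOCUMENTS_INDEX",
    "REDIS_HOST", "REDIS_PORT", "REDIS_DB", "REDIS_PASSWORD",
]


def check_env():
    """Raise immediately if any required variable is unset or empty."""
    missing = [v for v in REQUIRED_VARS if not os.getenv(v)]
    if missing:
        raise RuntimeError("missing environment variables: " + ", ".join(missing))
```

Calling check_env() at the top of the `if __name__ == '__main__':` block would turn a late, opaque connection failure into an explicit startup error.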
diff --git a/server/workers/triple/src/search_triple.py b/server/workers/triple/src/search_triple.py
index 0f11dc6be..b19dec233 100644
--- a/server/workers/triple/src/search_triple.py
+++ b/server/workers/triple/src/search_triple.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import re
 import json
@@ -101,8 +102,8 @@ def build_body(self, parameters):
             body["query"]["bool"]["must"].append({"term": {"language": parameters.get('language')}})
         return body
 
-    def search(self, parameters):
-        index = "triple-poc-document27032021"
+    def search_documents(self, parameters):
+        index = os.getenv("TRIPLE_DOCUMENTS_INDEX")
         fields = ["headline.text", "abstract.text"]
         s = Search(using=self.es, index=index)
         # TODO: replace from parameters
@@ -125,9 +126,9 @@ def search(self, parameters):
         if parameters.get('raw') is True:
             return result.to_dict()
         else:
-            return self.process_result(result, parameters)
+            return self.process_documents(result, parameters)
 
-    def process_result(self, result, parameters):
+    def process_documents(self, result, parameters):
         """
         # * "id": a unique ID, preferably the DOI
         # * "title": the title
@@ -248,7 +249,7 @@ def run(self):
         try:
             res = {}
             res["id"] = k
-            res["input_data"] = self.search(parameters)
+            res["input_data"] = self.search_documents(parameters)
             res["params"] = parameters
             res["status"] = "success"
             if parameters.get('raw') is True:
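The renamed search_documents() now resolves its index from TRIPLE_DOCUMENTS_INDEX instead of the old hard-coded "triple-poc-document27032021" literal and queries it through elasticsearch_dsl. A rough standalone sketch of that pattern follows, assuming an Elasticsearch 7.x client; the client setup and the multi_match query stand in for TripleClient's real connection handling and build_body() logic:

```python
import os

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search

# Stand-in client; TripleClient builds self.es from the same TRIPLE_* variables.
es = Elasticsearch(
    hosts=[{"host": os.getenv("TRIPLE_HOST"), "port": int(os.getenv("TRIPLE_PORT", "9200"))}],
    http_auth=(os.getenv("TRIPLE_USER"), os.getenv("TRIPLE_PASS")),
)

# Index name comes from the environment, as in search_documents().
index = os.getenv("TRIPLE_DOCUMENTS_INDEX")

# Simplified query over the same fields search_documents() targets.
s = Search(using=es, index=index).query(
    "multi_match", query="feminicide", fields=["headline.text", "abstract.text"]
)
response = s.execute()
for hit in response:
    print(hit.meta.id, hit.meta.score)
```

Keeping the index name in the environment lets the same image serve different TRIPLE index generations without a rebuild, which matches the image-per-version scheme the compose file now uses.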