From 0f394eaf3894b633a864c67928be02f344756d49 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 20 Feb 2020 10:55:29 -0500 Subject: [PATCH 1/7] A diagnostics script to be sent to the remote installations. (this is a *combined* script for BOTH #6510 and #6522!) --- .../issues/6510/check_datafiles_6522_6510.sh | 131 ++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100755 scripts/issues/6510/check_datafiles_6522_6510.sh diff --git a/scripts/issues/6510/check_datafiles_6522_6510.sh b/scripts/issues/6510/check_datafiles_6522_6510.sh new file mode 100755 index 00000000000..2cb873e92e1 --- /dev/null +++ b/scripts/issues/6510/check_datafiles_6522_6510.sh @@ -0,0 +1,131 @@ +#!/bin/sh + +# begin config +# PostgresQL credentials: +# edit the following lines so that psql can talk to your database +pg_host=localhost +pg_port=5432 +pg_user=dvnapp +pg_db=dvndb +# you can leave the password blank, if Postgres is configured +# to accept connections without auth: +pg_pass= +# psql executable - add full path, if necessary: +PSQL_EXEC=psql + +# end config + +# first issue, duplicate datafiles: (#6522) + +PG_QUERY_0="SELECT COUNT(DISTINCT o.id) FROM datafile f, dataset s, dvobject p, dvobject o WHERE s.id = p.id AND o.id = f.id AND o.owner_id = s.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null" + +PG_QUERY_1="SELECT s.id, o.storageidentifier FROM datafile f, dataset s, dvobject o WHERE o.id = f.id AND o.owner_id = s.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null ORDER by o.storageidentifier" + +PG_QUERY_2="SELECT p.authority, p.identifier, o.storageidentifier, o.id, o.createdate, f.contenttype FROM datafile f, dvobject p, dvobject o WHERE o.id = f.id AND o.owner_id = p.id AND o.storageidentifier='%s' ORDER by o.id" + +PGPASSWORD=$pg_pass; export PGPASSWORD + +echo "Checking the number of non-harvested datafiles in the database..." 
+ +NUM_DATAFILES=`${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_0}"` +if [ $? != 0 ] +then + echo "FAILED to execute psql! Check the credentials and try again?" + echo "exiting..." + echo + echo "the command line that failed:" + echo "${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c \"${PG_QUERY_0}\"" + exit 1 +fi + +echo $NUM_DATAFILES total. +echo + +echo "Let's check if any storage identifiers are referenced more than once within the same dataset:" + +${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_1}" | +uniq -c | +awk '{if ($1 > 1) print $NF}' > /tmp/storageidentifiers.tmp + +NUM_CONFIRMED=`cat /tmp/storageidentifiers.tmp | wc -l` + +if [ $NUM_CONFIRMED == 0 ] +then + echo + echo "Good news - it appears that there are NO duplicate DataFile objects in your database." + echo "Your installation is ready to be upgraded to Dataverse 4.20." + echo +else + + echo "The following storage identifiers appear to be referenced from multiple DvObjects:" + cat /tmp/storageidentifiers.tmp + echo "(output saved in /tmp/storageidentifiers.tmp)" + + echo "Looking up details for the affected datafiles:" + + cat /tmp/storageidentifiers.tmp | while read si + do + PG_QUERY_SI=`printf "${PG_QUERY_2}" $si` + ${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_SI}" + done | tee /tmp/duplicates_info.tmp + + echo "(output saved in /tmp/duplicates_info.tmp)" + + echo + echo "Please send the output above to Dataverse support." + echo "We will assist you in the process of cleaning up the affected files above." + echo "We apologize for any inconvenience." 
+ echo +fi + +# second issue, repeated ingests: (issue #6510) + +PG_QUERY_3="SELECT COUNT(DISTINCT o.id) FROM datafile f, dataset s, dvobject o, datatable t WHERE o.id = f.id AND o.owner_id = s.id AND t.datafile_id = f.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null" + +PG_QUERY_4="SELECT t.id, f.id FROM datafile f, dataset s, dvobject o, datatable t WHERE o.id = f.id AND o.owner_id = s.id AND t.datafile_id = f.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null ORDER by f.id, t.id" + +PG_QUERY_5="SELECT p.authority, p.identifier, o.storageidentifier, o.id, t.id, o.createdate, f.contenttype, t.originalfileformat FROM datafile f, dvobject p, dvobject o, datatable t WHERE o.id = f.id AND t.datafile_id = f.id AND o.owner_id = p.id AND o.id='%s' ORDER by t.id" + +echo +echo "Checking the number of ingested (\"tabular\") datafiles in the database..." + +NUM_DATAFILES=`${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_3}"` + +echo $NUM_DATAFILES total. +echo + +echo "Let's check if any of these ingested files have MORE THAN ONE linked datatable objects:" + +${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_4}" | +uniq -c -f 1 | +awk '{if ($1 > 1) print $NF}' > /tmp/datafileids.tmp + +NUM_CONFIRMED=`cat /tmp/datafileids.tmp | wc -l` + +if [ $NUM_CONFIRMED == 0 ] +then + echo + echo "Good news - it appears that there are no tabular files affected by this issue in your database." 
+ echo + exit 0 +fi + +echo "The following "${NUM_CONFIRMED}" DataFile ids appear to be referenced from multiple DataTables:" +cat /tmp/datafileids.tmp +echo "(output saved in /tmp/datafileids.tmp)" + +echo "Looking up details for the affected tabular files:" + +cat /tmp/datafileids.tmp | while read si +do + PG_QUERY_SI=`printf "${PG_QUERY_5}" $si` + ${PSQL_EXEC} -h ${pg_host} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c "${PG_QUERY_SI}" +done | tee /tmp/multiple_ingests_info.tmp + +echo "(output saved in /tmp/multiple_ingests_info.tmp)" + +echo +echo "Please send the output above to Dataverse support." +echo "We will assist you in fixing this issue in your Dataverse database." +echo "We apologize for any inconvenience." +echo From 9d6729b2a69e1de791c8c0994ba1fb798bad1bd0 Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Thu, 20 Feb 2020 11:13:03 -0500 Subject: [PATCH 2/7] "pre-release note" to be sent out to the remote installations. This (and the proper release note) SUPERSEDES what was in PR #6522! i.e. we are sending out only ONE note, not TWO, there's only one script to run, etc. (ref. #6510) --- ...6510-duplicate-datafiles-and-datatables.md | 24 +++++++++++++++++ doc/release-notes/6522-datafile-duplicates.md | 27 ------------------- scripts/issues/6510/PRE-RELEASE-INFO.txt | 24 +++++++++++++++++ 3 files changed, 48 insertions(+), 27 deletions(-) create mode 100644 doc/release-notes/6510-duplicate-datafiles-and-datatables.md delete mode 100644 doc/release-notes/6522-datafile-duplicates.md create mode 100644 scripts/issues/6510/PRE-RELEASE-INFO.txt diff --git a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md new file mode 100644 index 00000000000..d0440374def --- /dev/null +++ b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md @@ -0,0 +1,24 @@ +We recently discovered that *potential* data integrity issues in +Dataverse databases. 
One manifests itself as duplicate DataFile +objects created for the same uploaded file; the other as duplicate +DataTable (tabular metadata) objects linked to the same +DataFile. (GitHub issues https://github.com/IQSS/dataverse/issues/6522 +and https://github.com/IQSS/dataverse/issues/6510 respectively). + +Please run the diagnostic script provided at +https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh +[NOTE!! the branch name must be changed to "develop" in the URL above before we merge!!] + +The script relies on the PostgreSQL utility psql to access the +database. You will need to edit the credentials at the top of the script +to match your database configuration. + +If neither of the two issues is present in your database, you will see +a message "... no duplicate DataFile objects in your database" and "no +tabular files affected by this issue in your database". + +If either, or both kinds of duplicates are detected, the script will +provide further instructions. We will need you to send us the produced +output. We will then assist you in resolving the issues in your +database. + diff --git a/doc/release-notes/6522-datafile-duplicates.md b/doc/release-notes/6522-datafile-duplicates.md deleted file mode 100644 index 39abb49cd69..00000000000 --- a/doc/release-notes/6522-datafile-duplicates.md +++ /dev/null @@ -1,27 +0,0 @@ -In this Dataverse release, we are adding a database constraint to -prevent duplicate DataFile objects pointing to the same physical file -from being created. - -Before this release can be deployed, your database must be checked -for any such duplicates that may already exist. If present, -the duplicates will need to be deleted, and the integrity of the -stored physical files verified. - -(We have notified the community about this issue ahead of the release, -so you may have already addressed it. 
In this case, please disregard -this release note) - -Please run the diagnostic script provided at -https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6522/find_duplicates.sh. -The script relies on the PostgreSQL utility `psql` to access the -database. You will need to edit the credentials at the top of the script -to match your database configuration. - -If this issue is not present in your database, you will see a message -`... no duplicate dvObjects in your database. Your installation is -ready to be upgraded to Dataverse 4.20`. - -If duplicates are detected, it will provide further instructions. We -will need you to send us the produced output. We will then assist you -in resolving this problem in your database. - diff --git a/scripts/issues/6510/PRE-RELEASE-INFO.txt b/scripts/issues/6510/PRE-RELEASE-INFO.txt new file mode 100644 index 00000000000..d0440374def --- /dev/null +++ b/scripts/issues/6510/PRE-RELEASE-INFO.txt @@ -0,0 +1,24 @@ +We recently discovered that *potential* data integrity issues in +Dataverse databases. One manifests itself as duplicate DataFile +objects created for the same uploaded file; the other as duplicate +DataTable (tabular metadata) objects linked to the same +DataFile. (GitHub issues https://github.com/IQSS/dataverse/issues/6522 +and https://github.com/IQSS/dataverse/issues/6510 respectively). + +Please run the diagnostic script provided at +https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh +[NOTE!! the branch name must be changed to "develop" in the URL above before we merge!!] + +The script relies on the PostgreSQL utility psql to access the +database. You will need to edit the credentials at the top of the script +to match your database configuration. + +If neither of the two issues is present in your database, you will see +a message "... no duplicate DataFile objects in your database" and "no +tabular files affected by this issue in your database". 
+ +If either, or both kinds of duplicates are detected, the script will +provide further instructions. We will need you to send us the produced +output. We will then assist you in resolving the issues in your +database. + From 634541ab26cd646e2f7c939303b72dcb1e86a446 Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Fri, 21 Feb 2020 11:30:19 -0500 Subject: [PATCH 3/7] text edits --- .../6510-duplicate-datafiles-and-datatables.md | 12 +++++------- scripts/issues/6510/PRE-RELEASE-INFO.txt | 14 ++++++-------- scripts/issues/6510/check_datafiles_6522_6510.sh | 4 ++-- 3 files changed, 13 insertions(+), 17 deletions(-) diff --git a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md index d0440374def..6182bd97162 100644 --- a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md +++ b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md @@ -1,13 +1,12 @@ -We recently discovered that *potential* data integrity issues in +We recently discovered a *potential* data integrity issue in Dataverse databases. One manifests itself as duplicate DataFile -objects created for the same uploaded file; the other as duplicate +objects created for the same uploaded file (https://github.com/IQSS/dataverse/issues/6522); the other as duplicate DataTable (tabular metadata) objects linked to the same -DataFile. (GitHub issues https://github.com/IQSS/dataverse/issues/6522 -and https://github.com/IQSS/dataverse/issues/6510 respectively). +DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately .0003% of datasets in Harvard's Dataverse. + +To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: -Please run the diagnostic script provided at https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh -[NOTE!! 
the branch name must be changed to "develop" in the URL above before we merge!!] The script relies on the PostgreSQL utility psql to access the database. You will need to edit the credentials at the top of the script @@ -21,4 +20,3 @@ If either, or both kinds of duplicates are detected, the script will provide further instructions. We will need you to send us the produced output. We will then assist you in resolving the issues in your database. - diff --git a/scripts/issues/6510/PRE-RELEASE-INFO.txt b/scripts/issues/6510/PRE-RELEASE-INFO.txt index d0440374def..2af6887571b 100644 --- a/scripts/issues/6510/PRE-RELEASE-INFO.txt +++ b/scripts/issues/6510/PRE-RELEASE-INFO.txt @@ -1,13 +1,12 @@ -We recently discovered that *potential* data integrity issues in +We recently discovered a *potential* data integrity issue in Dataverse databases. One manifests itself as duplicate DataFile -objects created for the same uploaded file; the other as duplicate +objects created for the same uploaded file (https://github.com/IQSS/dataverse/issues/6522); the other as duplicate DataTable (tabular metadata) objects linked to the same -DataFile. (GitHub issues https://github.com/IQSS/dataverse/issues/6522 -and https://github.com/IQSS/dataverse/issues/6510 respectively). +DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately .03% of datasets in Harvard's Dataverse. + +To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: -Please run the diagnostic script provided at https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh -[NOTE!! the branch name must be changed to "develop" in the URL above before we merge!!] The script relies on the PostgreSQL utility psql to access the database. You will need to edit the credentials at the top of the script @@ -19,6 +18,5 @@ tabular files affected by this issue in your database". 
If either, or both kinds of duplicates are detected, the script will provide further instructions. We will need you to send us the produced -output. We will then assist you in resolving the issues in your +output to support@dataverse.org and we will assist you in resolving the issues in your database. - diff --git a/scripts/issues/6510/check_datafiles_6522_6510.sh b/scripts/issues/6510/check_datafiles_6522_6510.sh index 2cb873e92e1..ce6e5db5b59 100755 --- a/scripts/issues/6510/check_datafiles_6522_6510.sh +++ b/scripts/issues/6510/check_datafiles_6522_6510.sh @@ -72,7 +72,7 @@ else echo "(output saved in /tmp/duplicates_info.tmp)" echo - echo "Please send the output above to Dataverse support." + echo "Please send the output above to Dataverse support at support@dataverse.org." echo "We will assist you in the process of cleaning up the affected files above." echo "We apologize for any inconvenience." echo @@ -125,7 +125,7 @@ done | tee /tmp/multiple_ingests_info.tmp echo "(output saved in /tmp/multiple_ingests_info.tmp)" echo -echo "Please send the output above to Dataverse support." +echo "Please send the output above to Dataverse support at support@dataverse.org." echo "We will assist you in fixing this issue in your Dataverse database." echo "We apologize for any inconvenience." echo From eed01cf8b67a6c6a487343d9e718e71841b67d0d Mon Sep 17 00:00:00 2001 From: Leonid Andreev Date: Fri, 21 Feb 2020 12:47:29 -0500 Subject: [PATCH 4/7] changed sh -> bash. 
(#6510) --- scripts/issues/6510/check_datafiles_6522_6510.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/issues/6510/check_datafiles_6522_6510.sh b/scripts/issues/6510/check_datafiles_6522_6510.sh index ce6e5db5b59..15bc7e80b5a 100755 --- a/scripts/issues/6510/check_datafiles_6522_6510.sh +++ b/scripts/issues/6510/check_datafiles_6522_6510.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/bin/bash # begin config # PostgresQL credentials: From f33a06368c31fe26e361e0186c85d95e291406ae Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Fri, 21 Feb 2020 13:16:38 -0500 Subject: [PATCH 5/7] updating to develop pre-merge --- doc/release-notes/6510-duplicate-datafiles-and-datatables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md index 6182bd97162..680635f7e17 100644 --- a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md +++ b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md @@ -6,7 +6,7 @@ DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted ap To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: -https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh +https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6510/check_datafiles_6522_6510.sh The script relies on the PostgreSQL utility psql to access the database. 
You will need to edit the credentials at the top of the script From 9e3e8cf77e428b2922741eccb4c6a33c2ab47dfd Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Fri, 21 Feb 2020 13:17:31 -0500 Subject: [PATCH 6/7] update to develop pre-merge --- scripts/issues/6510/PRE-RELEASE-INFO.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/issues/6510/PRE-RELEASE-INFO.txt b/scripts/issues/6510/PRE-RELEASE-INFO.txt index 2af6887571b..7b71581eb8f 100644 --- a/scripts/issues/6510/PRE-RELEASE-INFO.txt +++ b/scripts/issues/6510/PRE-RELEASE-INFO.txt @@ -6,7 +6,7 @@ DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted ap To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: -https://github.com/IQSS/dataverse/raw/6510-repeated-ingests/scripts/issues/6510/check_datafiles_6522_6510.sh +https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6510/check_datafiles_6522_6510.sh The script relies on the PostgreSQL utility psql to access the database. You will need to edit the credentials at the top of the script From b710561b4b3ef1bce0b6fffc9867f59f46b7407d Mon Sep 17 00:00:00 2001 From: Danny Brooke Date: Fri, 21 Feb 2020 13:29:18 -0500 Subject: [PATCH 7/7] Correcting my incorrect math --- doc/release-notes/6510-duplicate-datafiles-and-datatables.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md index 680635f7e17..18ac58860d8 100644 --- a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md +++ b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md @@ -2,7 +2,7 @@ We recently discovered a *potential* data integrity issue in Dataverse databases. 
One manifests itself as duplicate DataFile objects created for the same uploaded file (https://github.com/IQSS/dataverse/issues/6522); the other as duplicate DataTable (tabular metadata) objects linked to the same -DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately .0003% of datasets in Harvard's Dataverse. +DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately .03% of datasets in Harvard's Dataverse. To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: