diff --git a/doc/release-notes/6510-duplicate-datafiles-and-datatables.md b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md new file mode 100644 index 00000000000..18ac58860d8 --- /dev/null +++ b/doc/release-notes/6510-duplicate-datafiles-and-datatables.md @@ -0,0 +1,22 @@ +We recently discovered a *potential* data integrity issue in +Dataverse databases. One manifests itself as duplicate DataFile +objects created for the same uploaded file (https://github.com/IQSS/dataverse/issues/6522); the other as duplicate +DataTable (tabular metadata) objects linked to the same +DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately 0.03% of datasets in Harvard's Dataverse. + +To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: + +https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6510/check_datafiles_6522_6510.sh + +The script relies on the PostgreSQL utility psql to access the +database. You will need to edit the credentials at the top of the script +to match your database configuration. + +If neither of the two issues is present in your database, you will see +a message "... no duplicate DataFile objects in your database" and "no +tabular files affected by this issue in your database". + +If either or both kinds of duplicates are detected, the script will +provide further instructions. We will need you to send us the produced +output. We will then assist you in resolving the issues in your +database. diff --git a/doc/release-notes/6522-datafile-duplicates.md b/doc/release-notes/6522-datafile-duplicates.md deleted file mode 100644 index 39abb49cd69..00000000000 --- a/doc/release-notes/6522-datafile-duplicates.md +++ /dev/null @@ -1,27 +0,0 @@ -In this Dataverse release, we are adding a database constraint to -prevent duplicate DataFile objects pointing to the same physical file -from being created. 
- -Before this release can be deployed, your database must be checked -for any such duplicates that may already exist. If present, -the duplicates will need to be deleted, and the integrity of the -stored physical files verified. - -(We have notified the community about this issue ahead of the release, -so you may have already addressed it. In this case, please disregard -this release note) - -Please run the diagnostic script provided at -https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6522/find_duplicates.sh. -The script relies on the PostgreSQL utility `psql` to access the -database. You will need to edit the credentials at the top of the script -to match your database configuration. - -If this issue is not present in your database, you will see a message -`... no duplicate dvObjects in your database. Your installation is -ready to be upgraded to Dataverse 4.20`. - -If duplicates are detected, it will provide further instructions. We -will need you to send us the produced output. We will then assist you -in resolving this problem in your database. - diff --git a/scripts/issues/6510/PRE-RELEASE-INFO.txt b/scripts/issues/6510/PRE-RELEASE-INFO.txt new file mode 100644 index 00000000000..7b71581eb8f --- /dev/null +++ b/scripts/issues/6510/PRE-RELEASE-INFO.txt @@ -0,0 +1,22 @@ +We recently discovered a *potential* data integrity issue in +Dataverse databases. One manifests itself as duplicate DataFile +objects created for the same uploaded file (https://github.com/IQSS/dataverse/issues/6522); the other as duplicate +DataTable (tabular metadata) objects linked to the same +DataFile (https://github.com/IQSS/dataverse/issues/6510). This issue impacted approximately 0.03% of datasets in Harvard's Dataverse. 
+ +To see if any datasets in your installation have been impacted by this data integrity issue, we've provided a diagnostic script here: + +https://github.com/IQSS/dataverse/raw/develop/scripts/issues/6510/check_datafiles_6522_6510.sh + +The script relies on the PostgreSQL utility psql to access the +database. You will need to edit the credentials at the top of the script +to match your database configuration. + +If neither of the two issues is present in your database, you will see +a message "... no duplicate DataFile objects in your database" and "no +tabular files affected by this issue in your database". + +If either, or both kinds of duplicates are detected, the script will +provide further instructions. We will need you to send us the produced +output to support@dataverse.org and we will assist you in resolving the issues in your +database. diff --git a/scripts/issues/6510/check_datafiles_6522_6510.sh b/scripts/issues/6510/check_datafiles_6522_6510.sh new file mode 100755 index 00000000000..15bc7e80b5a --- /dev/null +++ b/scripts/issues/6510/check_datafiles_6522_6510.sh @@ -0,0 +1,131 @@ +#!/bin/bash + +# begin config +# PostgresQL credentials: +# edit the following lines so that psql can talk to your database +pg_host=localhost +pg_port=5432 +pg_user=dvnapp +pg_db=dvndb +# you can leave the password blank, if Postgres is configured +# to accept connections without auth: +pg_pass= +# psql executable - add full path, if necessary: +PSQL_EXEC=psql + +# end config + +# first issue, duplicate datafiles: (#6522) + +PG_QUERY_0="SELECT COUNT(DISTINCT o.id) FROM datafile f, dataset s, dvobject p, dvobject o WHERE s.id = p.id AND o.id = f.id AND o.owner_id = s.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null" + +PG_QUERY_1="SELECT s.id, o.storageidentifier FROM datafile f, dataset s, dvobject o WHERE o.id = f.id AND o.owner_id = s.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null ORDER by o.storageidentifier" + 
# ---------------------------------------------------------------------------
# Main body of the duplicate-DataFile / duplicate-DataTable diagnostic.
#
# Reviewer fixes in this section:
#   * every psql invocation now passes the configured port (pg_port is set
#     in the config block above, but was never actually used, so the script
#     silently ignored a non-default port);
#   * the second database check (PG_QUERY_3) now verifies that psql
#     succeeded, exactly like the first one does;
#   * backticks replaced with $(...), expansions quoted, read -r used,
#     and "cat file | ..." pipelines replaced with redirects.
# ---------------------------------------------------------------------------

# Detail lookup for one duplicated storage identifier; the %s placeholder is
# filled in via printf below. (The substituted values come from our own
# database query above, not from untrusted user input.)
PG_QUERY_2="SELECT p.authority, p.identifier, o.storageidentifier, o.id, o.createdate, f.contenttype FROM datafile f, dvobject p, dvobject o WHERE o.id = f.id AND o.owner_id = p.id AND o.storageidentifier='%s' ORDER by o.id"

PGPASSWORD=$pg_pass; export PGPASSWORD

# Single place where psql is invoked, so the configured host/port/user/db
# are applied consistently. PSQL_EXEC is deliberately left unquoted so that
# a value with extra flags (e.g. "psql -X") keeps working.
run_psql() {
    ${PSQL_EXEC} -h "${pg_host}" -p "${pg_port}" -U "${pg_user}" -d "${pg_db}" -tA -F ' ' -c "$1"
}

echo "Checking the number of non-harvested datafiles in the database..."

NUM_DATAFILES=$(run_psql "${PG_QUERY_0}")
if [ $? -ne 0 ]
then
    echo "FAILED to execute psql! Check the credentials and try again?"
    echo "exiting..."
    echo
    echo "the command line that failed:"
    echo "${PSQL_EXEC} -h ${pg_host} -p ${pg_port} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c \"${PG_QUERY_0}\""
    exit 1
fi

echo "${NUM_DATAFILES} total."
echo

echo "Let's check if any storage identifiers are referenced more than once within the same dataset:"

# PG_QUERY_1 emits "dataset_id storageidentifier" pairs; identical adjacent
# pairs mean more than one DataFile row points at the same stored file.
run_psql "${PG_QUERY_1}" |
uniq -c |
awk '{if ($1 > 1) print $NF}' > /tmp/storageidentifiers.tmp

NUM_CONFIRMED=$(wc -l < /tmp/storageidentifiers.tmp)

if [ "${NUM_CONFIRMED}" -eq 0 ]
then
    echo
    echo "Good news - it appears that there are NO duplicate DataFile objects in your database."
    echo "Your installation is ready to be upgraded to Dataverse 4.20."
    echo
else
    echo "The following storage identifiers appear to be referenced from multiple DvObjects:"
    cat /tmp/storageidentifiers.tmp
    echo "(output saved in /tmp/storageidentifiers.tmp)"

    echo "Looking up details for the affected datafiles:"

    while read -r si
    do
        PG_QUERY_SI=$(printf "${PG_QUERY_2}" "${si}")
        run_psql "${PG_QUERY_SI}"
    done < /tmp/storageidentifiers.tmp | tee /tmp/duplicates_info.tmp

    echo "(output saved in /tmp/duplicates_info.tmp)"

    echo
    echo "Please send the output above to Dataverse support at support@dataverse.org."
    echo "We will assist you in the process of cleaning up the affected files above."
    echo "We apologize for any inconvenience."
    echo
fi

# second issue, repeated ingests: (issue #6510)

PG_QUERY_3="SELECT COUNT(DISTINCT o.id) FROM datafile f, dataset s, dvobject o, datatable t WHERE o.id = f.id AND o.owner_id = s.id AND t.datafile_id = f.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null"

# "datatable_id datafile_id" pairs sorted by datafile id; a datafile id that
# repeats on consecutive lines has more than one DataTable attached to it.
PG_QUERY_4="SELECT t.id, f.id FROM datafile f, dataset s, dvobject o, datatable t WHERE o.id = f.id AND o.owner_id = s.id AND t.datafile_id = f.id AND s.harvestingclient_id IS null AND o.storageidentifier IS NOT null ORDER by f.id, t.id"

PG_QUERY_5="SELECT p.authority, p.identifier, o.storageidentifier, o.id, t.id, o.createdate, f.contenttype, t.originalfileformat FROM datafile f, dvobject p, dvobject o, datatable t WHERE o.id = f.id AND t.datafile_id = f.id AND o.owner_id = p.id AND o.id='%s' ORDER by t.id"

echo
echo "Checking the number of ingested (\"tabular\") datafiles in the database..."

NUM_DATAFILES=$(run_psql "${PG_QUERY_3}")
if [ $? -ne 0 ]
then
    # This failure check was missing in the original script; mirror the
    # handling of the first query so a broken connection is not silent.
    echo "FAILED to execute psql! Check the credentials and try again?"
    echo "exiting..."
    echo
    echo "the command line that failed:"
    echo "${PSQL_EXEC} -h ${pg_host} -p ${pg_port} -U ${pg_user} -d ${pg_db} -tA -F ' ' -c \"${PG_QUERY_3}\""
    exit 1
fi

echo "${NUM_DATAFILES} total."
echo

echo "Let's check if any of these ingested files have MORE THAN ONE linked datatable objects:"

# uniq -f 1 skips the first field (the datatable id) and groups on the
# datafile id, so the count reflects DataTables per DataFile.
run_psql "${PG_QUERY_4}" |
uniq -c -f 1 |
awk '{if ($1 > 1) print $NF}' > /tmp/datafileids.tmp

NUM_CONFIRMED=$(wc -l < /tmp/datafileids.tmp)

if [ "${NUM_CONFIRMED}" -eq 0 ]
then
    echo
    echo "Good news - it appears that there are no tabular files affected by this issue in your database."
    echo
    exit 0
fi

echo "The following ${NUM_CONFIRMED} DataFile ids appear to be referenced from multiple DataTables:"
cat /tmp/datafileids.tmp
echo "(output saved in /tmp/datafileids.tmp)"

echo "Looking up details for the affected tabular files:"

while read -r fid
do
    PG_QUERY_ID=$(printf "${PG_QUERY_5}" "${fid}")
    run_psql "${PG_QUERY_ID}"
done < /tmp/datafileids.tmp | tee /tmp/multiple_ingests_info.tmp

echo "(output saved in /tmp/multiple_ingests_info.tmp)"

echo
echo "Please send the output above to Dataverse support at support@dataverse.org."
echo "We will assist you in fixing this issue in your Dataverse database."
echo "We apologize for any inconvenience."
echo