1 change: 1 addition & 0 deletions changelog.d/4626.misc
@@ -0,0 +1 @@
+Improve 'user_ips' table deduplication background update
Member
maybe a bugfix? shrug

63 changes: 60 additions & 3 deletions synapse/storage/client_ips.py
@@ -167,12 +167,16 @@ def remove(txn):
                 clause = "? <= last_seen AND last_seen < ?"
                 args = (begin_last_seen, end_last_seen)
 
+            # (Note: The DISTINCT in the inner query is important to ensure that
+            # the COUNT(*) is accurate, otherwise double counting may happen due
+            # to the join effectively being a cross product)
             txn.execute(
                 """
                 SELECT user_id, access_token, ip,
-                       MAX(device_id), MAX(user_agent), MAX(last_seen)
+                       MAX(device_id), MAX(user_agent), MAX(last_seen),
+                       COUNT(*)
                 FROM (
-                    SELECT user_id, access_token, ip
+                    SELECT DISTINCT user_id, access_token, ip
                     FROM user_ips
                     WHERE {}
                 ) c
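The DISTINCT called out in the new comment matters because the full query (the join is in the part of the hunk elided here) joins the candidate keys back against user_ips: without DISTINCT the subquery returns one row per duplicate, the join becomes a per-key cross product, and COUNT(*) over-counts. A minimal sketch of the effect, under assumptions — a simplified user_ips schema and sqlite3 purely for illustration (Synapse itself runs this through its own transaction wrapper against SQLite or Postgres):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE user_ips (user_id TEXT, access_token TEXT, ip TEXT, last_seen INTEGER)"
)
# Three duplicate rows for one (user_id, access_token, ip) key.
conn.executemany(
    "INSERT INTO user_ips VALUES (?, ?, ?, ?)",
    [("@alice:hs", "token", "1.2.3.4", ts) for ts in (100, 200, 300)],
)

for keyword in ("", "DISTINCT"):
    # Same shape as the query in the hunk: aggregate over the subquery
    # joined back against user_ips on the candidate key.
    rows = conn.execute(
        """
        SELECT user_id, access_token, ip, MAX(last_seen), COUNT(*)
        FROM (
            SELECT {} user_id, access_token, ip FROM user_ips
        ) c
        INNER JOIN user_ips USING (user_id, access_token, ip)
        GROUP BY user_id, access_token, ip
        """.format(keyword)
    ).fetchall()
    print(keyword or "no DISTINCT", rows)

# no DISTINCT [('@alice:hs', 'token', '1.2.3.4', 300, 9)]   <- 3 x 3 cross product
# DISTINCT    [('@alice:hs', 'token', '1.2.3.4', 300, 3)]   <- accurate count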
@@ -186,7 +190,60 @@ def remove(txn):

             # We've got some duplicates
             for i in res:
-                user_id, access_token, ip, device_id, user_agent, last_seen = i
+                user_id, access_token, ip, device_id, user_agent, last_seen, count = i
+
+                # We want to delete the duplicates so we end up with only a
+                # single row.
+                #
+                # The naive way of doing this would be just to delete all rows
+                # and reinsert a constructed row. However, if there are a lot of
+                # duplicate rows this can cause the table to grow a lot, which
+                # can be problematic in two ways:
+                #   1. If user_ips is already large then this can cause the
+                #      table to rapidly grow, potentially filling the disk.
+                #   2. Reinserting a lot of rows can confuse the table
+                #      statistics for postgres, causing it to not use the
+                #      correct indices for the query above, resulting in a full
+                #      table scan. This is incredibly slow for large tables and
+                #      can kill database performance. (This seems to mainly
+                #      happen for the last query where the clause is simply
+                #      `? <= last_seen`)
+                #
+                # So instead we want to delete all but *one* of the duplicate
+                # rows. That is hard to do reliably, so we cheat and do a two
+                # step process:
+                #   1. Delete all rows with a last_seen strictly less than the
+                #      max last_seen. This hopefully results in deleting all but
+                #      one row the majority of the time, but there may be
+                #      duplicate last_seen values.
+                #   2. If multiple rows remain, we fall back to the naive method
+                #      and simply delete all rows and reinsert.
+                #
+                # Note that this relies on no new duplicate rows being inserted,
+                # but if that is happening then this entire process is futile
+                # anyway.
+
+                # Do step 1:
+
+                txn.execute(
+                    """
+                    DELETE FROM user_ips
+                    WHERE user_id = ? AND access_token = ? AND ip = ? AND last_seen < ?
+                    """,
+                    (user_id, access_token, ip, last_seen)
+                )
+                if txn.rowcount == count - 1:
+                    # We deleted all but one of the duplicate rows, i.e. there
+                    # is exactly one remaining and so there is nothing left to
+                    # do.
+                    continue
+                elif txn.rowcount >= count:
+                    raise Exception(
+                        "We deleted more duplicate rows from 'user_ips' than expected",
+                    )
+
+                # The previous step didn't delete enough rows, so we fall back
+                # to step 2:
+
+                # Drop all the duplicates
+                txn.execute(
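The hunk is truncated at this point, but the comments above spell out what step 2 does: drop every row for the duplicated key and reinsert a single constructed row carrying the MAX'd values. A hedged sketch of that fallback, assuming only the columns visible in this diff — the name dedupe_fallback is hypothetical, and the real (elided) code may differ in detail:

def dedupe_fallback(txn, user_id, access_token, ip,
                    device_id, user_agent, last_seen):
    # Step 2 sketch: delete *all* rows for this (user_id, access_token,
    # ip) key, then reinsert one row built from the aggregated values.
    txn.execute(
        """
        DELETE FROM user_ips
        WHERE user_id = ? AND access_token = ? AND ip = ?
        """,
        (user_id, access_token, ip),
    )
    txn.execute(
        """
        INSERT INTO user_ips
            (user_id, access_token, ip, device_id, user_agent, last_seen)
        VALUES (?, ?, ?, ?, ?, ?)
        """,
        (user_id, access_token, ip, device_id, user_agent, last_seen),
    )

This is exactly the "delete all rows and reinsert" approach the comments warn is expensive, which is why it only runs when step 1 leaves more than one row behind.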