Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
3b9b559
lots of stuff
egeakman May 24, 2024
a3db579
port funcs to 2024
egeakman May 25, 2024
0c50a62
update
egeakman May 25, 2024
1d106e0
oops + more readable + tell what event are we transforming
egeakman May 25, 2024
96111ab
better slug dupe check + optimize
egeakman May 25, 2024
08bcbde
add documentation
egeakman May 29, 2024
39a96e3
Update README.md
egeakman May 29, 2024
ecb1cc3
Update README.md
egeakman May 29, 2024
4276fa5
add configuration to readme
egeakman May 29, 2024
aba49d6
Use model_dump_json to be able to serialize datetime
egeakman May 29, 2024
4a0d477
Merge branch 'main' into port-to-2024
egeakman May 31, 2024
4e433ec
.env + documentation + extract more socials
egeakman May 31, 2024
fcceb66
exist_ok
egeakman Jun 1, 2024
b666971
url extraction functions
egeakman Jun 1, 2024
5798b4b
Tried to put timings under a different model
egeakman Jun 2, 2024
7818471
correct typing at some places
egeakman Jun 2, 2024
84d3387
better overall structure
egeakman Jun 2, 2024
339ba50
typing
egeakman Jun 2, 2024
df0ad5f
Add resources to the schema
egeakman Jun 2, 2024
f5e635f
Update README.md
egeakman Jun 2, 2024
66fa79f
oops missed this one
egeakman Jun 2, 2024
ee3f018
change gitx_url to gitx
egeakman Jun 2, 2024
96eb614
Add tests for mastodon and linkedin url extraction
NMertsch Jun 3, 2024
1dec5c8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 3, 2024
ce1de63
better code structure
egeakman Jun 4, 2024
de3f67d
Separate files
egeakman Jun 4, 2024
d875052
naming
egeakman Jun 4, 2024
42aba10
speaker website_url
egeakman Jun 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,11 @@ download:
python -m src.download

transform:
ifeq ($(ALLOW_DUPES), true)
python -m src.transform --allow-dupes
else
python -m src.transform

endif

all: download transform

Expand Down
10 changes: 6 additions & 4 deletions data/examples/output/sessions.json
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
"end": null,
"talks_in_parallel": null,
"talks_after": null,
"next_talk_code": null,
"prev_talk_code": null,
"talks_before": null,
"next_talk": null,
"prev_talk": null,
"website_url": "https://ep2024.europython.eu/session/this-is-a-test-talk-from-a-test-speaker-about-a-test-topic"
},
"B8CD4F": {
Expand All @@ -43,8 +44,9 @@
"end": null,
"talks_in_parallel": null,
"talks_after": null,
"next_talk_code": null,
"prev_talk_code": null,
"talks_before": null,
"next_talk": null,
"prev_talk": null,
"website_url": "https://ep2024.europython.eu/session/a-talk-with-shorter-title"
}
}
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions src/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
"speakers?questions=all",
]

if not Config.raw_path.exists():
Config.raw_path.mkdir(parents=True)

for resource in resources:
url = base_url + f"{resource}"

Expand Down
205 changes: 185 additions & 20 deletions src/transform.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from __future__ import annotations

import json
import sys
from datetime import datetime

from pydantic import BaseModel, Field, model_validator
Expand Down Expand Up @@ -30,9 +33,9 @@ class SubmissionState:
class PretalxAnswer(BaseModel):
question_text: str
answer_text: str
answer_file: str | None
submission_id: str | None
speaker_id: str | None
answer_file: str | None = None
submission_id: str | None = None
speaker_id: str | None = None

@model_validator(mode="before")
@classmethod
Expand All @@ -48,8 +51,8 @@ def extract(cls, values):
class PretalxSpeaker(BaseModel):
code: str
name: str
biography: str | None
avatar: str | None
biography: str | None = None
avatar: str | None = None
slug: str
answers: list[PretalxAnswer] = Field(..., exclude=True)
submissions: list[str]
Expand Down Expand Up @@ -93,7 +96,7 @@ class PretalxSubmission(BaseModel):
speakers: list[str] # We only want the code, not the full info
submission_type: str
slug: str
track: str | None
track: str | None = None
state: str
abstract: str
answers: list[PretalxAnswer] = Field(..., exclude=True)
Expand All @@ -105,14 +108,16 @@ class PretalxSubmission(BaseModel):

# This is embedding a slot inside a submission for easier lookup later
room: str | None = None
start: datetime | None = None
end: datetime | None = None
start: datetime | str | None = None
end: datetime | str | None = None

# TODO: once we have schedule data then we can prefill those in the code here
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To me this is mixing too many concerns:

  1. Representation of individual submissions
  2. Temporal relationship between submissions

I would prefer to store the relationships (parallel, before, after, next, previous) in a separate data structure which references these PretalxSubmission objects.
Then we wouldn't have this situation where creating a "complete" PretalxSubmission object requires two steps:

submission = PretalxSubmission(...)  # call `__init__`, which only initialized a part of the object
submission.set_talks_in_parallel(...)  # initialize another part of the object
submission.set_talks_after(...)  # initialize yet another part of the object
[...]

Fields like talks_in_parallel: list[str] | None = None can easily lead to accidents: my_submission.talks_in_parallel only sometimes contains the "talks in parallel to my submission", depending on when I access the field.

That would be a non-trivial change, and the current implementation seems to work, so I'm fine with merging it as it is. But let's try to not walk this path much further, else it might get messy 🙂

# These are added after the model is created
talks_in_parallel: list[str] | None = None
talks_after: list[str] | None = None
next_talk_code: str | None = None
prev_talk_code: str | None = None
talks_before: list[str] | None = None
next_talk: str | None = None
prev_talk: str | None = None

website_url: str | None = None

Expand Down Expand Up @@ -153,9 +158,21 @@ def extract(cls, values):
if isinstance(values["duration"], int):
values["duration"] = str(values["duration"])

if cls.is_publishable and values["slot"]:
slot = values["slot"]

if isinstance(slot["room"], dict):
values["room"] = slot["room"]["en"]

if slot["start"]:
values["start"] = datetime.fromisoformat(slot["start"])
values["end"] = datetime.fromisoformat(slot["end"])

slug = slugify(values["title"])
values["slug"] = slug
values["website_url"] = f"https://ep2024.europython.eu/session/{slug}"
values["website_url"] = (
f"https://ep{Config.event.split('-')[1]}.europython.eu/session/{slug}"
)

return values

Expand All @@ -171,6 +188,114 @@ def is_confirmed(self):
def is_publishable(self):
return self.is_accepted or self.is_confirmed

@staticmethod
def set_talks_in_parallel(
    submission: PretalxSubmission, all_sessions: dict[str, PretalxSubmission]
):
    """Store the codes of all sessions whose time slot overlaps ``submission``.

    The result is written to ``submission.talks_in_parallel``; nothing is
    returned. Two slots overlap when each one starts strictly before the
    other one ends, so back-to-back sessions are not considered parallel.

    Args:
        submission: The session to compute the parallel talks for.
        all_sessions: Every candidate session, keyed by session code.
    """
    # An unscheduled (or half-scheduled) submission cannot overlap anything.
    # Checking once up front also keeps this invariant out of the loop.
    if submission.start is None or submission.end is None:
        submission.talks_in_parallel = []
        return

    submission.talks_in_parallel = [
        session.code
        for session in all_sessions.values()
        if session.code != submission.code
        # Sessions without a complete slot cannot be compared; skip them
        # instead of failing on a ``None`` comparison.
        and session.start is not None
        and session.end is not None
        # If they intersect, they are in parallel
        and session.start < submission.end
        and session.end > submission.start
    ]

# Fills in submission.talks_after (one upcoming same-day session per room) and
# submission.next_talk (the upcoming session in the submission's own room).
# Reads submission.end and submission.talks_in_parallel; the latter defaults to
# None on the model, so set_talks_in_parallel is expected to have run first.
@staticmethod
def set_talks_after(
submission: PretalxSubmission, all_sessions: dict[str, PretalxSubmission]
):
# Sort sessions based on start time, early first
# The (x.start is None, x.start) key places sessions without a start time last.
all_sessions_sorted = sorted(
all_sessions.values(), key=lambda x: (x.start is None, x.start)
)

# Filter out sessions
# Kept: scheduled sessions on the same day of month that start at or after
# the end of this submission and are neither parallel to it nor the submission itself.
remaining_sessions = [
session
for session in all_sessions_sorted
if session.start is not None
and session.start >= submission.end
and session.code not in submission.talks_in_parallel
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can change these two lines to include intersecting talks like these:

image

In the current version, PYO panel would not show How Python can help monitor governments as a talk after.

We can change it to do so.

and session.code != submission.code
and submission.start.day == session.start.day
# Chained comparison: a session is dropped here only when both it and the
# submission have the type "Announcements".
and not submission.submission_type
== session.submission_type
== "Announcements"
]

# Add sessions to the list if they are in different rooms
# remaining_sessions is ordered by start time, so the first hit per room is
# that room's earliest remaining session.
seen_rooms = set()
unique_sessions = []

for session in remaining_sessions:
if session.room not in seen_rooms:
unique_sessions.append(session)
seen_rooms.add(session.room)

# If there is a keynote next, only show that
if any(s.submission_type == "Keynote" for s in unique_sessions):
unique_sessions = [
s for s in unique_sessions if s.submission_type == "Keynote"
]

# Set the next talks in all rooms
submission.talks_after = [session.code for session in unique_sessions]

# Set the next talk in the same room
# next_talk is left unchanged when no remaining session shares the submission's room.
for session in unique_sessions:
if session.room == submission.room:
submission.next_talk = session.code
break

@staticmethod
def set_talks_before(
    submission: PretalxSubmission, all_sessions: dict[str, PretalxSubmission]
):
    """Store the sessions that directly precede ``submission`` in each room.

    Writes ``submission.talks_before`` (the latest earlier session of every
    room, latest first) and ``submission.prev_talk`` (the one in the
    submission's own room, if any). Nothing is returned.

    Only sessions on the same day of month are considered. Sessions that run
    in parallel to the submission and sessions of type "Announcements" are
    ignored.

    Args:
        submission: The session to compute the preceding talks for.
        all_sessions: Every candidate session, keyed by session code.
    """
    # An unscheduled submission has no "before"; comparing against a missing
    # start time would otherwise raise a TypeError.
    if submission.start is None:
        submission.talks_before = []
        return

    # talks_in_parallel stays None until set_talks_in_parallel has run, so
    # treat "not computed yet" as "nothing known to be parallel" instead of
    # crashing on a membership test against None.
    parallel_codes = set(submission.talks_in_parallel or ())

    earlier_sessions = [
        session
        for session in all_sessions.values()
        if session.start is not None
        and session.code != submission.code
        and session.code not in parallel_codes
        and session.start <= submission.start
        and submission.start.day == session.start.day
        and session.submission_type != "Announcements"
    ]
    # Latest first, so the first session seen per room is the closest one.
    # The sort is stable, which keeps ties in their original order.
    earlier_sessions.sort(key=lambda session: session.start, reverse=True)

    seen_rooms = set()
    closest_per_room = []
    for session in earlier_sessions:
        if session.room not in seen_rooms:
            seen_rooms.add(session.room)
            closest_per_room.append(session)

    submission.talks_before = [session.code for session in closest_per_room]

    # prev_talk is left untouched when no earlier session shares the room.
    for session in closest_per_room:
        if session.room == submission.room:
            submission.prev_talk = session.code
            break

def model_dump(self, **kwargs):
    """Dump the model to a dict with ``start``/``end`` as ISO 8601 strings.

    Unlike a plain assignment to ``self.start``/``self.end``, the conversion
    is applied to the dumped copy only, so the instance keeps its datetime
    values and the method can safely be called more than once.

    Args:
        **kwargs: Forwarded unchanged to the parent ``model_dump``.

    Returns:
        The dumped dict; ``start`` and ``end`` are ISO strings or ``None``.
    """
    data = super().model_dump(**kwargs)

    for key in ("start", "end"):
        value = data.get(key)
        # Values that are already strings (or excluded / None) pass through.
        if isinstance(value, datetime):
            data[key] = value.isoformat()

    return data


def parse_submissions() -> list[PretalxSubmission]:
"""
Expand Down Expand Up @@ -209,33 +334,73 @@ def publishable_speakers(accepted_proposals: set[str]) -> dict[str, PretalxSpeak
return output


def save_publishable_sessions():
def save_publishable_sessions(publishable: dict[str, PretalxSubmission]):
path = Config.public_path / "sessions.json"

publishable = publishable_submissions()
for sub in publishable.values():
if sub.start is not None:
PretalxSubmission.set_talks_in_parallel(sub, publishable)
PretalxSubmission.set_talks_after(sub, publishable)
PretalxSubmission.set_talks_before(sub, publishable)

data = {k: v.model_dump() for k, v in publishable.items()}
with open(path, "w") as fd:
json.dump(data, fd, indent=2)


def save_publishable_speakers():
def save_publishable_speakers(publishable: dict[str, PretalxSubmission]):
path = Config.public_path / "speakers.json"

publishable = publishable_submissions()
speakers = publishable_speakers(publishable.keys())

data = {k: v.model_dump() for k, v in speakers.items()}
with open(path, "w") as fd:
json.dump(data, fd, indent=2)


def save_all(all_sessions: dict[str, PretalxSubmission]):
    """Write the public session and speaker files for ``all_sessions``.

    Makes sure ``Config.public_path`` exists, then delegates to
    ``save_publishable_sessions`` and ``save_publishable_speakers``.

    Args:
        all_sessions: The publishable sessions, keyed by session code.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    Config.public_path.mkdir(parents=True, exist_ok=True)

    save_publishable_sessions(all_sessions)
    save_publishable_speakers(all_sessions)


def check_duplicate_slugs(all_sessions: dict[str, PretalxSubmission]) -> bool:
    """Report slugs that are shared by several sessions or several speakers.

    Speakers are looked up via ``publishable_speakers`` for the given session
    codes. Every duplicated slug is printed once, in order of first
    appearance.

    Args:
        all_sessions: The publishable sessions, keyed by session code.

    Returns:
        ``True`` when all session slugs and all speaker slugs are unique,
        ``False`` when at least one duplicate was found and printed.
    """
    from collections import Counter

    all_speakers = publishable_speakers(all_sessions.keys())

    # Counting once is linear; Counter also keeps first-seen order, which
    # makes the report deterministic between runs.
    session_counts = Counter(s.slug for s in all_sessions.values())
    speaker_counts = Counter(s.slug for s in all_speakers.values())

    session_duplicates = [slug for slug, n in session_counts.items() if n > 1]
    speaker_duplicates = [slug for slug, n in speaker_counts.items() if n > 1]

    if not (session_duplicates or speaker_duplicates):
        return True

    print("Found duplicate slugs:")
    for slug in session_duplicates:
        print(f"Session: {slug}")
    for slug in speaker_duplicates:
        print(f"Speaker: {slug}")
    return False


if __name__ == "__main__":
print(f"Transforming {Config.event} data...")
print("Checking for duplicate slugs...")
assert len(set(s.slug for s in publishable_submissions().values())) == len(
publishable_submissions()
)

all_sessions = publishable_submissions()

if not check_duplicate_slugs(all_sessions) and (
len(sys.argv) <= 1 or sys.argv[1] != "--allow-dupes"
):
print("Exiting. Use ``make transform ALLOW_DUPES=true`` to continue.")
sys.exit(1)

print("Saving publishable data...")
save_publishable_sessions()
save_publishable_speakers()
save_all(all_sessions)
print("Done")