Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions ctd/read.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,12 @@ def _open_compressed(fname):


def _read_file(fname):
"""Read file contents."""
"""Read file contents, or read from StringIO object."""
if isinstance(fname, StringIO):
fname.seek(0)
text = fname.read()
return StringIO(text)

if not isinstance(fname, Path):
fname = Path(fname).resolve()

Expand Down Expand Up @@ -102,6 +107,7 @@ def _parse_seabird(lines, ftype):
"""Parse searbird formats."""
# Initialize variables.
lon = lat = time = None, None, None
fname = None
skiprows = 0

metadata = {}
Expand All @@ -119,6 +125,9 @@ def _parse_seabird(lines, ftype):
# Seabird headers starts with *.
if line.startswith("*"):
header.append(line)
if "FileName" in line:
file_path = line.split("=")[-1].strip()
fname = Path(file_path).stem

# Seabird configuration starts with #.
if line.startswith("#"):
Expand Down Expand Up @@ -172,6 +181,7 @@ def _parse_seabird(lines, ftype):
names.append("Statistic")
metadata.update(
{
"name": fname if fname else "unknown",
"header": "\n".join(header),
"config": "\n".join(config),
"names": _remane_duplicate_columns(names),
Expand Down Expand Up @@ -261,7 +271,9 @@ def from_btl(fname):

df["Statistic"] = df["Statistic"].str.replace(r"\(|\)", "") # (avg) to avg

name = _basename(fname)[1]
if "name" not in metadata:
name = _basename(fname)[1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This forces folks using a stream to always pass a name, but that is optional, so the API here is kind of broken (or at least lacking documentation).

Ideally we should check if it is a stream and hardcode a default name for those cases.

For that to work we should move this check to _read_file and make it output both f and name there. That way we check for StringIO only once.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I hope I'm not overcomplicating this idea, but did you want _read_file to return a tuple of values, the first being the StringIO object and the second being the file/stream name?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. _read_file is an internal-use function, so it is OK to "break it", and that is better than changing the call in from_btl. Does that make sense?

Copy link
Contributor Author

@upsonp upsonp May 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's an idea.

What if when reading the file, we grab the BTL "File Name" in the header section and use that to set the file name?

* Sea-Bird SBE 9 Data File:
* FileName = C:\CTD_ACQUISITION\2021185HUD\ctddata\185A007.hdr
* Software Version Seasave V 7.26.7.121
* Temperature SN = 5083
* Conductivity SN = 3562
 ...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to check if that metadata exists and, if not, fill with a placeholder. But sure, that is probably the best way to go.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, what I mean is what if we got the file name in the _parse_seabird() method, if a name hasn't been set already.

metadata["name"] = str(name)

dtypes = {
"bpos": int,
Expand All @@ -282,7 +294,6 @@ def from_btl(fname):
warnings.warn("Could not convert %s to float." % column)

df["Date"] = pd.to_datetime(df["Date"])
metadata["name"] = str(name)
setattr(df, "_metadata", metadata)
return df

Expand Down
17 changes: 11 additions & 6 deletions tests/test_read.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,10 @@ def btl():


@pytest.fixture
def btl_duplicate_header_name():
yield ctd.from_btl(data_path.joinpath("btl", "alt_bottletest.BTL"))
def btl_as_stream():
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not need to be a fixture b/c we won't be calling it multiple times in tests.
You can remove the @pytest.fixture and rename it to test_btl_as_stream so pytest can execute it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I'm not used to unit testing, specifically with pytest; I was just emulating what other tests had done. I'll fix it up.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't worry, and no need to be sorry. The only reason I'm not sending commits directly to your branch is b/c I believe you want to do this. If not, I can take over. Your contributions are very welcome, and I'm trying to pay back by showing a bit of how my messy tests work.

file = open(mode="rb", file=data_path.joinpath("btl", "alt_bottletest.BTL"))
stream = StringIO(file.read().decode("cp1252"))
yield ctd.from_btl(stream)


@pytest.fixture
Expand Down Expand Up @@ -83,10 +85,13 @@ def test_btl_is_dataframe(btl):
assert not btl.empty


def test_btl_with_dup_cols(btl_duplicate_header_name):
assert all(
col in btl_duplicate_header_name.columns for col in ["Bottle", "Bottle_"]
)
def test_btl_with_dup_cols(btl_as_stream):
assert all(col in btl_as_stream.columns for col in ["Bottle", "Bottle_"])


def test_btl_as_stringio(btl_as_stream):
assert isinstance(btl_as_stream, pd.DataFrame)
assert not btl_as_stream.empty


def test_ros_is_dataframe(ros):
Expand Down