PDS catalog

PDS related stuff for pyuvis

Like index files etc.

planetarypy provides these catalogs exactly as they are stored on PDS; on top of them, this notebook adds some convenient filtering.

⚠️ Demo cells below are marked #| eval: false — they require live PDS index fetches, cached data products, or interactive plotting backends that are unavailable during a clean docs render. Open this notebook in JupyterLab to step through it cell-by-cell against your local data.

import hashlib
import urllib
from datetime import date as dtdate
from datetime import timedelta
from pathlib import Path

import pandas as pd
from yarl import URL

import hvplot.pandas  # noqa: F401

from pyuvis import CatalogFilter
from planetarypy.pds import (
    get_index,
    print_available_indexes,  # replaces removed find_indexes()
)
from planetarypy import utils

# Exploratory cells: build a CatalogFilter for one date and look at what it exposes.
# The string looks like a year plus day-of-year ("YYYY-DDD") — TODO confirm which
# formats CatalogFilter accepts.
date = "2002-198"

cat = CatalogFilter(date)
# The following bare expressions only display attribute values in the notebook.
cat.date

cat.doydate

cat.isodate

# Distinct (OBSERVATION_TYPE, TARGET_NAME) combinations present in `cat.df`.
cat.df.groupby(["OBSERVATION_TYPE", "TARGET_NAME"]).size().index

cat.uv.tail()

cat.ustare_stars.index

cat.date

cat.get_ustare_stars_date().index

cat.uv.head()

# NOTE(review): `df` is not assigned anywhere above in this view — presumably it is
# an index DataFrame loaded in a cell that is not shown; confirm before running
# the notebook top to bottom.
df.columns

# Preview the first rows of the "ring_summary" index of "cassini.uvis".
get_index("cassini.uvis", "ring_summary").head()

df.head()

# Names of all columns that contain "TIME" (case-insensitive).
[col for col in df.columns if "TIME" in col.upper()]

# Derive two helper columns from FILE_NAME:
#   product_id — the file name without directory and extension,
#   volume     — the second "/"-separated component of the path
#                (presumably the COUVIS_xxxx delivery volume; assumes FILE_NAME
#                starts with "/" — TODO confirm).
df["product_id"] = df.FILE_NAME.map(lambda x: Path(x).stem)
df["volume"] = df.FILE_NAME.map(lambda x: x.split("/")[1])

df.columns

# Move the helper columns to the front. Selecting the remaining columns by name
# (instead of slicing off "the last two") keeps this cell idempotent: when it is
# re-run, the helper columns already sit at the front, and a positional slice
# would duplicate them and silently drop two real columns.
lead_cols = ["product_id", "volume"]
new_cols = lead_cols + [col for col in df.columns if col not in lead_cols]

df = df[new_cols]

df.head()

Duplicates

The following products are duplicated in the data delivered to PDS, always in adjacent delivery volumes (COUVIS_xxxx):

# Order the catalog by file path first; later cells rely on this ordering.
df = df.sort_values(by="FILE_NAME")

# Flag every row whose product_id occurs more than once (all occurrences, not
# just the repeats) and list those rows grouped together by product_id.
is_duplicated = df.duplicated(subset="product_id", keep=False)
df.loc[is_duplicated].sort_values(by="product_id")

I ran tests to confirm that the MD5 hashes are identical for all duplicates and that the label files differ only in the stated data production date.

After confirming with Mark and the team at RMS, we will remove the duplicates from the PDS index files; however, the delivered volumes will remain unchanged.

base_url = URL("https://opus.pds-rings.seti.org/holdings/volumes/COUVIS_0xxx/")

# Collect the duplicated rows BEFORE removing them from `df`. If the drop ran
# first, `duplicates` and `grouped` would always be empty and the verification
# loops further down would silently check nothing.
duplicates = df[df.duplicated(subset="product_id", keep=False)]

# One entry per (product_id, FILE_NAME) pair; level 0 of the index lists the
# duplicated product ids, and `grouped[pid].index` the file names for one id.
grouped = duplicates.groupby(["product_id", "FILE_NAME"]).size()

# Now reduce the catalog to a single row per product.
df = df.drop_duplicates(
    subset="product_id", keep="last"
)  # keep last works as I sorted by `FILE_NAME` before

grouped.head()

import hashlib
import urllib


def get_remote_md5_sum(url, max_file_size=100 * 1024 * 1024):
    """Return the hex MD5 digest of the resource at ``url``.

    The resource is streamed in 4 KiB chunks so that large data files never
    have to be held in memory. MD5 is used here purely as a content
    fingerprint for comparing files, not for anything security related.

    Parameters
    ----------
    url : str or urllib.request.Request
        Location of the resource; anything ``urllib.request.urlopen`` accepts.
    max_file_size : int, optional
        Upper bound on the number of bytes that are hashed (default 100 MiB).
        Longer resources are truncated silently: only their first
        ``max_file_size`` bytes contribute to the digest.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the bytes that were read.

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be opened.
    """
    # A plain ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    digest = hashlib.md5()
    remaining = max_file_size
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as remote:
        while remaining > 0:
            chunk = remote.read(min(4096, remaining))
            if not chunk:
                break
            digest.update(chunk)
            # Count the bytes actually received; a read may return fewer
            # bytes than were requested.
            remaining -= len(chunk)

    return digest.hexdigest()


def read_remote_label(url):
    """Fetch the label file at ``url`` and return its content as text.

    The previous body was a copy of the hashing loop that referenced names
    which do not exist in this function (``max_file_size`` and a hash
    object), so every call raised an exception. It now does what its name
    says: download the label and hand back its text.

    Parameters
    ----------
    url : str or urllib.request.Request
        Location of the label; anything ``urllib.request.urlopen`` accepts.

    Returns
    -------
    str
        The complete label, decoded with Python's default codec (UTF-8).

    Raises
    ------
    urllib.error.URLError
        If the resource cannot be opened.
    UnicodeDecodeError
        If the content is not valid UTF-8.
    """
    # A plain ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly here.
    import urllib.request

    # The context manager closes the connection once the label has been read.
    with urllib.request.urlopen(url) as remote:
        return remote.read().decode()

Check for hash equality of duplicate files

# For every duplicated product id, fetch the two data files and compare their
# MD5 digests.
for product in grouped.index.levels[0]:
    # Each duplicated product is expected to have exactly two file names.
    label_a, label_b = grouped[product].index
    # Drop the first character of the file name (presumably the leading "/")
    # and point at the ".DAT" file next to the label.
    data_urls = [
        base_url / str(Path(label[1:]).with_suffix(".DAT"))
        for label in (label_a, label_b)
    ]
    digest_a, digest_b = [get_remote_md5_sum(str(link)) for link in data_urls]
    print(product, "OK" if digest_a == digest_b else "is different.")

Check for content equality of label files

import urllib.request  # plain `import urllib` does not guarantee the submodule
from itertools import zip_longest

# For every duplicated product id, download both label files and print each
# pair of lines that differs.
for pid in grouped.index.levels[0]:
    print(pid)
    # Each duplicated product is expected to have exactly two file names.
    fname1, fname2 = grouped[pid].index
    labels = []
    for fname in (fname1, fname2):
        # fname[1:] drops the first character (presumably the leading "/").
        with urllib.request.urlopen(str(base_url / fname[1:])) as response:
            labels.append(response.read().decode().splitlines())
    # zip_longest rather than zip: when one label has more lines than the
    # other, the surplus lines are reported instead of being skipped silently.
    for line1, line2 in zip_longest(*labels, fillvalue="<missing line>"):
        if line1 != line2:
            print(line1, "\n", line2)

# Index the catalog by product id (modifies `df` in place).
df.set_index("product_id", inplace=True)

# Boolean marker per row: True where the product id starts with "EUV" or "FUV".
# NOTE(review): the name `filter` shadows the Python builtin for the rest of the
# notebook session; consider renaming once all users of this name are known.
filter = df.index.map(lambda x: x[:3] in ["EUV", "FUV"])

# Catalog restricted to the EUV/FUV products.
pds_euv_fuv = df[filter]

pds_euv_fuv.head()

# Look up a single product by its exact id.
pds_euv_fuv.query("index=='EUV2002_198_04_31'")

# All products whose id contains "EUV2002_198".
pds_euv_fuv[pds_euv_fuv.index.str.contains("EUV2002_198")]

# NOTE(review): `spica` is not defined anywhere in this view — presumably a
# DataFrame with a DATE column created in a cell that is not shown; confirm.
spica.groupby("DATE").size()

spica.query("DATE=='2001-04-03'")

# NOTE(review): `fname` is not used by any of the cells visible below.
fname = "EUV2001_093_08_35_28"

s = pds_euv_fuv.index

# Product ids that start with "EUV2001_093".
s[s.str.startswith("EUV2001_093")]

df.head()