PDS catalog

Duplicates

The following products are duplicated in the data delivered to PDS, always in adjacent delivery volumes (COUVIS_xxxx):

# Order the catalog by file name; the later `keep="last"` de-duplication
# relies on this ordering.
df = df.sort_values("FILE_NAME")
# Show every row whose product id occurs more than once, with the rows of
# each duplicated product id placed next to each other.
df.loc[df.duplicated("product_id", keep=False)].sort_values("product_id")

I ran tests to confirm that the md5sum hashes are the same for all duplicates and that the label files differ only in the given data production date.

After confirming with Mark and the team at RMS, we will remove the duplicates from the PDS index files; however, the delivered volumes will remain unchanged.

# Capture the duplicated rows BEFORE de-duplicating `df`: once the duplicates
# are dropped, `df.duplicated(...)` is False everywhere and the verification
# loops further down would have nothing to iterate over.
duplicates = df[df.duplicated(subset="product_id", keep=False)]
# One entry per (product_id, FILE_NAME) pair; the resulting MultiIndex is what
# the hash and label checks walk through.
grouped = duplicates.groupby(["product_id", "FILE_NAME"]).size()

# Root URL that the index file names are appended to when fetching files.
base_url = URL("https://opus.pds-rings.seti.org/holdings/volumes/COUVIS_0xxx/")

df = df.drop_duplicates(
    subset="product_id", keep="last"
)  # keep last works as I sorted by `FILE_NAME` before
grouped.head()
import hashlib
import urllib
import urllib.request


def get_remote_md5_sum(url, max_file_size=100 * 1024 * 1024):
    """Return the MD5 hex digest of the content served at `url`.

    The content is streamed in 4096-byte chunks so the file never has to be
    held in memory as a whole.

    Args:
        url: URL string (or request object) accepted by
            `urllib.request.urlopen`.
        max_file_size: Upper bound, in bytes, on how much content is hashed.
            Reading stops as soon as the running byte count exceeds this
            value, and the chunk that crossed the limit is not hashed, so
            the digest of an oversized file covers only its leading part.

    Returns:
        The hexadecimal MD5 digest of the bytes that were hashed.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    digest = hashlib.md5()
    total_read = 0

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as remote:
        while True:
            chunk = remote.read(4096)
            if not chunk:
                break

            # Count the bytes actually received; the final chunk is usually
            # shorter than the requested 4096 bytes.
            total_read += len(chunk)
            if total_read > max_file_size:
                break

            digest.update(chunk)

    return digest.hexdigest()


def read_remote_label(url):
    """Return the text content of the label file served at `url`.

    The response body is read completely and decoded with the default
    codec of `bytes.decode` (UTF-8).

    Args:
        url: URL string (or request object) accepted by
            `urllib.request.urlopen`.

    Returns:
        The decoded label content as a single string.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(url) as remote:
        return remote.read().decode()

Check for hash equality of duplicate files

# For every duplicated product id, fetch the MD5 digest of both `.DAT` files
# and report whether they match.
for pid in grouped.index.levels[0]:
    # Exactly two file names are expected per duplicated product id.
    fname1, fname2 = grouped[pid].index
    # Drop the first character of each index file name (presumably a leading
    # path separator -- TODO confirm) and swap the suffix for ".DAT".
    url1, url2 = (
        base_url / str(Path(name[1:]).with_suffix(".DAT"))
        for name in (fname1, fname2)
    )
    hash1, hash2 = (get_remote_md5_sum(str(url)) for url in (url1, url2))
    print(pid, "OK" if hash1 == hash2 else "is different.")

Check for content equality of label files

# For every duplicated product id, download both label files and print each
# pair of lines that differs, so the differences can be inspected by eye.
for pid in grouped.index.levels[0]:
    print(pid)
    # Exactly two file names are expected per duplicated product id.
    fname1, fname2 = grouped[pid].index
    # Drop the first character of each index file name before appending it to
    # the base URL (presumably a leading path separator -- TODO confirm).
    url1 = base_url / fname1[1:]
    url2 = base_url / fname2[1:]
    with urllib.request.urlopen(str(url1)) as response:
        data1 = response.read().decode()
    with urllib.request.urlopen(str(url2)) as response:
        data2 = response.read().decode()
    lines1 = data1.splitlines()
    lines2 = data2.splitlines()
    for line1, line2 in zip(lines1, lines2):
        if line1 != line2:
            print(line1, "\n", line2)
    # `zip` stops at the shorter label, so surplus lines in the longer one
    # would go unnoticed without this explicit length check.
    if len(lines1) != len(lines2):
        print("Line count differs:", len(lines1), "vs", len(lines2))
# Exploratory look-ups on the catalog, restricted to EUV/FUV products.

# Re-index the catalog by product id (mutates `df` in place).
df.set_index("product_id", inplace=True)
# Boolean mask: True where the first three characters of the product id are
# "EUV" or "FUV".
# NOTE(review): the name `filter` shadows the Python builtin from here on.
filter = df.index.map(lambda x: x[:3] in ["EUV", "FUV"])
pds_euv_fuv = df[filter]
pds_euv_fuv.head()
# Exact match on a single product id ...
pds_euv_fuv.query("index=='EUV2002_198_04_31'")
# ... versus all product ids containing the given substring.
pds_euv_fuv[pds_euv_fuv.index.str.contains("EUV2002_198")]
# NOTE(review): `spica` is not defined in this section -- presumably a
# DataFrame with a "DATE" column created elsewhere; confirm.
spica.groupby("DATE").size()
spica.query("DATE=='2001-04-03'")
# NOTE(review): `fname` is not used by the remaining statements shown here.
fname = "EUV2001_093_08_35_28"
s = pds_euv_fuv.index
# All EUV/FUV product ids that start with the given prefix.
s[s.str.startswith("EUV2001_093")]
df.head()