import hashlib
import urllib
from datetime import date as dtdate
from datetime import timedelta
from pathlib import Path
import pandas as pd
from yarl import URL
import hvplot.pandas # noqa: F401
from pyuvis import CatalogFilter
from planetarypy.pds import (
get_index,
print_available_indexes, # replaces removed find_indexes()
)
from planetarypy import utils

# PDS catalog
# Duplicates
# The following are duplicates in the data delivered to PDS, always in adjacent delivery volumes COUVIS_xxxx:
df = df.sort_values(by="FILE_NAME")
# Capture the duplicated rows now: once `drop_duplicates` below has run,
# `df.duplicated` no longer finds any rows, and the hash/label checks further
# down would silently have nothing to verify.
duplicates = df[df.duplicated(subset="product_id", keep=False)]
duplicates.sort_values(by="product_id")

# I ran tests to confirm that the MD5 hashes are the same for all duplicates and
# that the label files only ever differ in the given data production date.
# After confirming with Mark and team at RMS, we will remove the duplicates from
# the PDS index files; however, the delivered volumes will remain unchanged.
df = df.drop_duplicates(
    subset="product_id", keep="last"
)  # keep last works as I sorted by `FILE_NAME` before

base_url = URL("https://opus.pds-rings.seti.org/holdings/volumes/COUVIS_0xxx/")
# One row per (product_id, FILE_NAME) pair among the duplicated products.
grouped = duplicates.groupby(["product_id", "FILE_NAME"]).size()
grouped.head()

import hashlib
import urllib
import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded
def get_remote_md5_sum(url, max_file_size=100 * 1024 * 1024):
    """Return the hex MD5 digest of the content served at ``url``.

    The content is streamed in 4096-byte chunks so it is never held in memory
    as a whole. Hashing stops before the first chunk that would take the number
    of bytes actually read past ``max_file_size``; in that case the digest
    covers only the leading part of the content.

    Parameters
    ----------
    url : str
        Address handed to ``urllib.request.urlopen``.
    max_file_size : int, optional
        Upper bound, in bytes, on how much content is hashed.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the bytes that were hashed.
    """
    # `import urllib` alone does not guarantee that `urllib.request` is loaded.
    import urllib.request

    digest = hashlib.md5()
    total_read = 0
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as remote:
        while True:
            data = remote.read(4096)
            # Count the bytes really received: the final chunk is usually
            # shorter than 4096 and must not be charged as a full chunk.
            total_read += len(data)
            if not data or total_read > max_file_size:
                break
            digest.update(data)
    return digest.hexdigest()
def read_remote_label(url):
    """Return the decoded text of the label file served at ``url``.

    Parameters
    ----------
    url : str
        Address handed to ``urllib.request.urlopen``.

    Returns
    -------
    str
        The complete response body, decoded with the default codec (UTF-8).
    """
    # `import urllib` alone does not guarantee that `urllib.request` is loaded.
    import urllib.request

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url) as remote:
        return remote.read().decode()


# Check for hash equality of duplicate files
# Compare the MD5 digests of the two data files behind every duplicated product id.
for pid in grouped.index.levels[0]:
    label_a, label_b = grouped[pid].index
    # Drop the first character of each label name and point at the matching .DAT file.
    data_urls = [
        base_url / str(Path(label[1:]).with_suffix(".DAT"))
        for label in (label_a, label_b)
    ]
    digest_a, digest_b = [get_remote_md5_sum(str(data_url)) for data_url in data_urls]
    verdict = "OK" if digest_a == digest_b else "is different."
    print(pid, verdict)

# Check for content equality of label files
import urllib.request  # `import urllib` alone does not guarantee the submodule is loaded
from itertools import zip_longest

# Print every line that differs between the two label files of each duplicated
# product id.
for pid in grouped.index.levels[0]:
    print(pid)
    fname1, fname2 = grouped[pid].index
    url1 = base_url / fname1[1:]
    url2 = base_url / fname2[1:]
    with urllib.request.urlopen(str(url1)) as response:
        data1 = response.read().decode()
    with urllib.request.urlopen(str(url2)) as response:
        data2 = response.read().decode()
    # zip_longest instead of zip: if one label has more lines than the other,
    # the surplus lines are reported too instead of being silently ignored.
    for line1, line2 in zip_longest(
        data1.splitlines(), data2.splitlines(), fillvalue="<no line>"
    ):
        if line1 != line2:
            print(line1, "\n", line2)

df.set_index("product_id", inplace=True)
# NOTE(review): `filter` shadows the builtin; the name is kept because code
# outside this section may refer to it.
filter = df.index.map(lambda x: x[:3] in ["EUV", "FUV"])
pds_euv_fuv = df[filter]
pds_euv_fuv.head()
pds_euv_fuv.query("index=='EUV2002_198_04_31'")
pds_euv_fuv[pds_euv_fuv.index.str.contains("EUV2002_198")]
# NOTE(review): `spica` is not defined in this section - presumably created elsewhere; confirm.
spica.groupby("DATE").size()
spica.query("DATE=='2001-04-03'")
fname = "EUV2001_093_08_35_28"
s = pds_euv_fuv.index
s[s.str.startswith("EUV2001_093")]
df.head()