import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display
import os
# [docs]  (Sphinx "viewcode" link marker left over from HTML extraction; not part of the code)
# Matches chain tokens such as "C16", "C20N2" or "N3" inside an identifier string.
_CHAIN_PATTERN = re.compile(r'[CN]\d+[A-Za-z0-9]*')


def _strip_wrapping_quotes(text):
    """Return *text* without outer whitespace and without surrounding quote characters."""
    text = text.strip()
    # Paths pasted from a shell or file manager are often wrapped in quotes,
    # sometimes in several layers; peel them off one layer at a time.
    while text.startswith(("'", '"')) and text.endswith(("'", '"')):
        text = text[1:-1]
    return text.strip()


def _extract_chain_pair(identifier):
    """Return the first two chain tokens of *identifier* as a sorted tuple, or None if fewer than two."""
    matches = _CHAIN_PATTERN.findall(str(identifier))
    if len(matches) < 2:
        return None
    # Lexicographic sort so the same two tokens always map to the same key.
    return tuple(sorted(matches[:2]))


def _chain_id_output_path(csv_path, output_location):
    """Build the ``<name>_chainID<ext>`` output path, creating *output_location* when it is given."""
    base, ext = os.path.splitext(os.path.basename(csv_path))
    if output_location is None:
        # No directory requested: write next to the input file.
        return f"{os.path.splitext(csv_path)[0]}_chainID{ext}"
    os.makedirs(output_location, exist_ok=True)
    return os.path.join(output_location, f"{base}_chainID{ext}")


def assign_chain_length(output_location=None, rt_min=0, rt_max=2500, chain_lengths = ['C16', 'C18', 'C20', 'C22', 'C24', 'C26', 'C28', 'C30', 'C32'], *, rt_tolerance=5):
    """
    Interactively assign chain-length labels to rows of a CSV by retention time.

    The user is prompted for the path of a CSV file. Rows whose ``Rt`` lies
    strictly between ``rt_min`` and ``rt_max`` are plotted (``Rt`` against
    ``Area All``). The user then clicks once per entry of ``chain_lengths``,
    in order; every row whose ``Rt`` is within ``rt_tolerance`` of the click
    gets that label written to its ``Component`` column. Pressing Backspace or
    Delete undoes the most recent assignment. When the last label has been
    placed, the filtered table is written to ``<input name>_chainID<ext>``.

    Parameters
    ----------
    output_location : str or path-like, optional
        Directory in which the annotated CSV is written. It is created if it
        does not exist. If None, the file is written next to the input CSV.
    rt_min : float, default=0
        Exclusive lower bound on ``Rt`` for rows that are kept.
    rt_max : float, default=2500
        Exclusive upper bound on ``Rt`` for rows that are kept.
    chain_lengths : sequence of str, default=['C16', 'C18', ..., 'C32']
        Labels to assign, in click order. The sequence is copied on entry and
        never modified.
    rt_tolerance : float, default=5
        Half-width of the ``Rt`` window around each click; rows with
        ``click - rt_tolerance <= Rt <= click + rt_tolerance`` receive the
        label. Keyword-only.

    Returns
    -------
    None
        Results are delivered as side effects: a plot, status messages in an
        output widget, and the ``*_chainID`` CSV written on completion. If the
        given path is not an existing file, a message is printed and the
        function returns without plotting.

    Raises
    ------
    ValueError
        If ``chain_lengths`` is empty or ``rt_tolerance`` is negative.
    KeyError
        If the CSV lacks one of the columns ``Rt``, ``Area All`` or
        ``Identifier 1``.

    Notes
    -----
    - Points are coloured by the pair of chain tokens found in
      ``Identifier 1``; rows with fewer than two tokens are drawn in gray as
      "Unknowns".
    - The written CSV contains the filtered rows plus the ``Chains`` and
      ``Component`` columns.
    - Undo restores whatever ``Component`` values the affected rows had
      before the undone click. The file on disk is only rewritten the next
      time the final label is assigned.
    """
    # Snapshot the labels so neither the shared default list nor a caller's
    # list can change underneath the click handlers.
    labels = list(chain_lengths)
    if not labels:
        raise ValueError("chain_lengths must contain at least one label")
    if rt_tolerance < 0:
        raise ValueError("rt_tolerance must be non-negative")

    csv_path = _strip_wrapping_quotes(input("Provide file location: "))
    if not os.path.isfile(csv_path):
        print(f"File not found: {csv_path}")
        return

    df = pd.read_csv(csv_path)
    missing = [col for col in ('Rt', 'Area All', 'Identifier 1') if col not in df.columns]
    if missing:
        raise KeyError(f"Required column(s) missing from {csv_path}: {missing}")

    df = df.loc[(df['Rt'] > rt_min) & (df['Rt'] < rt_max)].copy()
    df['Chains'] = df['Identifier 1'].apply(_extract_chain_pair)

    if 'Component' not in df.columns:
        df['Component'] = ''
    else:
        # A column read as all-NaN floats cannot hold string labels without
        # an upcast; make it an object column up front.
        df['Component'] = df['Component'].astype(object)

    from matplotlib import colormaps
    cmap = colormaps['tab10']
    standard_chains = df['Chains'].dropna().unique()
    # Wrap around the palette: indexing past the last entry of a listed
    # colormap would otherwise give every further pair the same colour.
    chain_to_color = {
        tuple(pair): cmap(i % cmap.N) for i, pair in enumerate(standard_chains)
    }

    fig, ax = plt.subplots(figsize=(10, 6))
    for pair, color in chain_to_color.items():
        # Compare element-wise via apply: ``Series == tuple`` would be
        # treated as an array comparison rather than a match on the tuple.
        subset = df[df['Chains'].apply(lambda value, pair=pair: value == pair)]
        if not subset.empty:
            ax.scatter(subset['Rt'], subset['Area All'], color=color,
                       label=f'Standard {pair[0]} & {pair[1]}',
                       alpha=0.65, edgecolor='k', s=50, zorder=1)
    unknowns = df[df['Chains'].isna()]
    if not unknowns.empty:
        # Unknowns sit beneath the standards (lower zorder, more transparent).
        ax.scatter(unknowns['Rt'], unknowns['Area All'], color='gray',
                   label='Unknowns', alpha=0.4, edgecolor='k', s=50, zorder=0)
    ax.set_xlabel('Retention Time (Rt)')
    ax.set_ylabel('Area All')
    if ax.get_legend_handles_labels()[0]:
        ax.legend(loc='upper right')

    out = widgets.Output()
    display(out)

    chain_index = 0
    vertical_lines = []
    text_labels = []
    previous_values = []  # Component values overwritten by each click, for undo

    def on_click(event):
        nonlocal chain_index
        if event.inaxes != ax:
            return
        if chain_index >= len(labels):
            with out:
                out.clear_output()
                print("All chain lengths assigned. You may close the plot.")
            return

        rt_clicked = event.xdata
        current_chain = labels[chain_index]

        vertical_lines.append(ax.axvline(rt_clicked, color='red', linestyle='--'))
        text_labels.append(
            ax.text(rt_clicked, ax.get_ylim()[1]*0.95, current_chain,
                    color='red', fontsize=12, rotation=90,
                    verticalalignment='top', horizontalalignment='right')
        )
        fig.canvas.draw_idle()

        mask = (df['Rt'] >= rt_clicked - rt_tolerance) & (df['Rt'] <= rt_clicked + rt_tolerance)
        # Remember what is being overwritten so undo can restore it exactly,
        # including labels from an earlier, overlapping window.
        previous_values.append(df.loc[mask, 'Component'].copy())
        df.loc[mask, 'Component'] = current_chain

        with out:
            out.clear_output()
            print(f"Assigned {current_chain} to {mask.sum()} rows near Rt={rt_clicked:.2f} s")
            if chain_index + 1 < len(labels):
                print(f"Please click the location for {labels[chain_index + 1]}")
        chain_index += 1

        if chain_index == len(labels):
            new_path = _chain_id_output_path(csv_path, output_location)
            # df is left intact so that undo followed by a new click can
            # save again without error.
            df.to_csv(new_path, index=False)
            with out:
                print(f"\nAll chain lengths assigned.\nSaved updated dataframe to:\n{new_path}")

    def on_key(event):
        nonlocal chain_index
        if event.key not in ['backspace', 'delete']:
            return
        if chain_index == 0:
            with out:
                out.clear_output()
                print("No assignments to undo.")
            return

        chain_index -= 1
        vertical_lines.pop().remove()
        text_labels.pop().remove()
        restored = previous_values.pop()
        if not restored.empty:
            df.loc[restored.index, 'Component'] = restored
        fig.canvas.draw_idle()
        with out:
            out.clear_output()
            print(f"Undo last assignment. Now please click the location for {labels[chain_index]}")

    fig.canvas.mpl_connect('button_press_event', on_click)
    fig.canvas.mpl_connect('key_press_event', on_key)
    with out:
        print(f"Please click the location for {labels[0]}")
    plt.show()