Source code for stangene.classify
"""Feature type classification for gene vs non-gene triage."""
import pandas as pd
from stangene._logging import get_logger
from stangene.species import CLASSIFICATION_PATTERNS
# Module-level logger, obtained from the package logging helper under the
# name "classify"; used below to report per-type classification counts.
logger = get_logger("classify")
# Translation table: 10x Cell Ranger ``feature_type`` label -> internal
# feature type name used by this module.
_CELLRANGER_TYPE_MAP = {
    "Gene Expression": "gene",
    "Antibody Capture": "antibody_capture",
    "CRISPR Guide Capture": "crispr_guide",
    "Custom": "custom",
    "Peaks": "peak",
}
# Internal feature types that are treated as non-gene features.
_NON_GENE_TYPES = frozenset(
    {
        "transcript",
        "antibody_capture",
        "crispr_guide",
        "spike_in",
        "peak",
        "custom",
    }
)
[docs]
def classify_features(ft: pd.DataFrame) -> pd.DataFrame:
    """Classify features as gene or non-gene types.

    Works on a copy of *ft*; the input frame is not modified.

    For every row, ``original_feature_type`` is resolved as follows:

    1. A value that is a key of ``_CELLRANGER_TYPE_MAP`` is replaced by
       the mapped internal type.
    2. A value that is already an internal type (``"gene"`` or a member
       of ``_NON_GENE_TYPES``) is kept unchanged.
    3. Anything else (including missing values) is classified from
       ``original_feature_name``: the first entry of
       ``CLASSIFICATION_PATTERNS`` whose pattern matches ``str(name)``
       supplies the type. If nothing matches, the row defaults to
       ``"gene"`` and ``mapping_notes`` records that fallback.

    Rows whose resolved type is in ``_NON_GENE_TYPES`` get
    ``mapping_status`` set to ``"non_gene_feature"``.

    Rows are addressed by position rather than by index label, so frames
    with duplicate index labels are handled row by row.

    Args:
        ft: Feature table. The ``original_feature_type``,
            ``mapping_status`` and ``mapping_notes`` columns are created
            (filled with ``None``) when absent.

    Returns:
        A copy of *ft* with the three columns above present and updated.

    Raises:
        KeyError: If a row needs name-based classification and the
            ``original_feature_name`` column does not exist.
    """
    result = ft.copy()

    # Ensure the columns written below exist.
    for column in ("original_feature_type", "mapping_status", "mapping_notes"):
        if column not in result.columns:
            result[column] = None

    # Resolve column positions once; .iat is positional, which keeps each
    # access scalar even when index labels repeat.
    type_col = result.columns.get_loc("original_feature_type")
    status_col = result.columns.get_loc("mapping_status")
    notes_col = result.columns.get_loc("mapping_notes")
    # Looked up lazily so that a frame whose rows are all already typed
    # does not require the name column.
    name_col = None

    # Loop-invariant: every value that already is an internal type.
    known_types = _NON_GENE_TYPES | {"gene"}

    for row in range(len(result)):
        current = result.iat[row, type_col]

        if pd.notna(current) and current in _CELLRANGER_TYPE_MAP:
            final_type = _CELLRANGER_TYPE_MAP[current]
            result.iat[row, type_col] = final_type
        elif pd.notna(current) and current in known_types:
            final_type = current
        else:
            if name_col is None:
                name_col = result.columns.get_loc("original_feature_name")
            name = str(result.iat[row, name_col])

            for pattern, ftype in CLASSIFICATION_PATTERNS:
                if pattern.match(name):
                    final_type = ftype
                    break
            else:
                # No pattern matched: fall back to "gene" and say so.
                final_type = "gene"
                result.iat[row, notes_col] = (
                    "classified as gene by default (no pattern match)"
                )
            result.iat[row, type_col] = final_type

        if final_type in _NON_GENE_TYPES:
            result.iat[row, status_col] = "non_gene_feature"

    classified_counts = result["original_feature_type"].value_counts().to_dict()
    logger.info("Feature classification: %s", classified_counts)
    return result