python
Utilities
CSV Data Processor
Flexible CSV processor with support for filtering, transforming, aggregating, and exporting data using pandas.
Apex Logic
0 copies
python
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional, Callable
class CSVProcessor:
    """Chainable wrapper around a pandas DataFrame loaded from a CSV file.

    The working frame lives in ``self.df``. The cleaning methods
    (``filter_rows``, ``rename_columns``, ``drop_duplicates`` and
    ``fill_missing``) update it and return ``self`` so calls can be chained.
    ``aggregate`` returns a new DataFrame; ``export`` and ``summary`` return
    ``None``.
    """

    def __init__(self, filepath: str, encoding: str = "utf-8"):
        """Read ``filepath`` with ``pandas.read_csv`` and print its size.

        Args:
            filepath: Location of the CSV file to load.
            encoding: Text encoding forwarded to ``pandas.read_csv``.
        """
        self.filepath = Path(filepath)
        self.df = pd.read_csv(filepath, encoding=encoding)
        # Shape as loaded; ``self.df.shape`` changes as rows are removed.
        self.original_shape = self.df.shape
        print(f"Loaded {self.original_shape[0]} rows, {self.original_shape[1]} columns")

    def filter_rows(self, column: str, condition: Callable) -> "CSVProcessor":
        """Keep only the rows whose ``column`` value satisfies ``condition``.

        Args:
            column: Name of the column whose values are tested.
            condition: Called once per value; the row is kept when the
                result is truthy.

        Returns:
            This processor, to allow chaining.
        """
        # On an empty column ``Series.apply`` returns a Series with the
        # column's own (non-boolean) dtype; ``df[...]`` would then treat it
        # as a list of column labels and return a frame with no columns.
        # Coercing to bool and selecting rows through ``.loc`` keeps the
        # columns intact in that case.
        mask = self.df[column].apply(condition).astype(bool)
        self.df = self.df.loc[mask]
        return self

    def rename_columns(self, mapping: Dict[str, str]) -> "CSVProcessor":
        """Rename columns according to ``mapping`` (old name -> new name).

        Returns:
            This processor, to allow chaining.
        """
        self.df = self.df.rename(columns=mapping)
        return self

    def drop_duplicates(self, subset: Optional[List[str]] = None) -> "CSVProcessor":
        """Drop duplicate rows and print how many were removed.

        Args:
            subset: Columns used to detect duplicates; ``None`` is passed
                through to ``DataFrame.drop_duplicates`` unchanged.

        Returns:
            This processor, to allow chaining.
        """
        before = len(self.df)
        self.df = self.df.drop_duplicates(subset=subset)
        print(f"Removed {before - len(self.df)} duplicates")
        return self

    def fill_missing(self, column: str, value) -> "CSVProcessor":
        """Replace missing values in ``column`` with ``value``.

        Returns:
            This processor, to allow chaining.
        """
        self.df[column] = self.df[column].fillna(value)
        return self

    def aggregate(self, group_by: str, agg_dict: Dict[str, str]) -> pd.DataFrame:
        """Group by ``group_by`` and aggregate with ``agg_dict``.

        Args:
            group_by: Column to group on.
            agg_dict: Mapping passed to ``GroupBy.agg`` (column -> function).

        Returns:
            A new DataFrame with the group key restored as a regular column;
            ``self.df`` is not reassigned.
        """
        return self.df.groupby(group_by).agg(agg_dict).reset_index()

    def export(self, output: str, index: bool = False) -> None:
        """Write the current frame to ``output`` as CSV and print a summary.

        Args:
            output: Destination path handed to ``DataFrame.to_csv``.
            index: Whether to write the row index as well.
        """
        self.df.to_csv(output, index=index)
        print(f"Exported {len(self.df)} rows to {output}")

    def summary(self) -> None:
        """Print the frame's shape, column names and per-column null counts."""
        print(f"\nShape: {self.df.shape}")
        print(f"Columns: {list(self.df.columns)}")
        print(f"Missing values:\n{self.df.isnull().sum()}")
if __name__ == "__main__":
    # Demo pipeline: load the sales file, clean it, and write the result.
    # Each cleaning method returns the processor itself, so calling them one
    # after another is equivalent to a single chained expression.
    processor = CSVProcessor("data/sales.csv")
    processor.drop_duplicates(["order_id"])
    processor.filter_rows("amount", lambda amount: amount > 0)
    processor.fill_missing("region", "Unknown")
    processor.export("data/cleaned_sales.csv")