From 921e55cf60d29c09748b7a93eba7f78b28f3c1c7 Mon Sep 17 00:00:00 2001 From: Douglas Raillard Date: Wed, 18 Jun 2025 20:42:03 +0100 Subject: [PATCH] lisa: Fix pl.LazyFrame.rename() uses FIX The new column names must not already exist in the LazyFrame, otherwise a duplicated column error will arise later on. Fix that issue by dropping any column that we plan on creating with the rename operation prior to the rename. --- lisa/analysis/latency.py | 14 ++++++++++--- lisa/analysis/tasks.py | 44 +++++++++++++++++++++++++++------------- lisa/trace.py | 26 +++++++++++++++++------- 3 files changed, 60 insertions(+), 24 deletions(-) diff --git a/lisa/analysis/latency.py b/lisa/analysis/latency.py index 69a32f800..20296cf1b 100644 --- a/lisa/analysis/latency.py +++ b/lisa/analysis/latency.py @@ -56,7 +56,7 @@ class LatencyAnalysis(TraceAnalysisBase): (pl.col('next_state') == next_state) ) df = df.select(['Time', 'delta', 'cpu', 'target_cpu']) - df = df.rename({'delta': name}) + df = df.drop(name, strict=False).rename({'delta': name}) return df @TraceAnalysisBase.df_method @@ -290,11 +290,19 @@ class LatencyAnalysis(TraceAnalysisBase): if wakeup: wkp_df = ana.df_latency_wakeup(task) - wkp_df = wkp_df.rename({'wakeup_latency': 'latency'}) + wkp_df = ( + wkp_df + .drop('latency', strict=False) + .rename({'wakeup_latency': 'latency'}) + ) if preempt: prt_df = ana.df_latency_preemption(task) - prt_df = prt_df.rename({'preempt_latency': 'latency'}) + prt_df = ( + prt_df + .drop('latency', strict=False) + .rename({'preempt_latency': 'latency'}) + ) if wakeup and preempt: return pl.concat([wkp_df, prt_df], how='diagonal_relaxed') diff --git a/lisa/analysis/tasks.py b/lisa/analysis/tasks.py index f6481763f..6d6a11936 100644 --- a/lisa/analysis/tasks.py +++ b/lisa/analysis/tasks.py @@ -696,26 +696,38 @@ class TasksAnalysis(TraceAnalysisBase): prev_sw_df = sw_df.select(["Time", "__cpu", "prev_pid", "prev_state", "prev_comm"]) next_sw_df = sw_df.select(["Time", "__cpu", "next_pid", "next_comm"]) - prev_sw_df = prev_sw_df.rename({ - "prev_pid": "pid", - "prev_state": "curr_state", - "prev_comm": "comm", - }) + prev_sw_df = ( + prev_sw_df + .drop('pid', 'curr_state', 'comm', strict=False) + .rename({ + "prev_pid": "pid", + "prev_state": "curr_state", + "prev_comm": "comm", + }) + ) next_sw_df = next_sw_df.with_columns( curr_state=state(TaskState.TASK_ACTIVE) ) - next_sw_df = next_sw_df.rename({ - 'next_pid': 'pid', - 'next_comm': 'comm' - }) + next_sw_df = ( + next_sw_df + .drop('pid', 'comm', strict=False) + .rename({ + 'next_pid': 'pid', + 'next_comm': 'comm' + }) + ) all_sw_df = pl.concat([prev_sw_df, next_sw_df], how='diagonal_relaxed') if add_rename: - rename_df = trace.df_event('task_rename').rename({ - 'oldcomm': 'comm', - '__pid': 'pid', - }) + rename_df = ( + trace.df_event('task_rename') + .drop('pid', 'comm', strict=False) + .rename({ + 'oldcomm': 'comm', + '__pid': 'pid', + }) + ) rename_df = rename_df.select(['Time', 'pid', 'comm']) rename_df = rename_df.with_columns( curr_state=state(TaskState.TASK_RENAMED), @@ -729,7 +741,11 @@ class TasksAnalysis(TraceAnalysisBase): df = pl.concat([all_sw_df, wk_df], how='diagonal_relaxed') df = df.sort('Time') - df = df.rename({'__cpu': 'cpu'}) + df = ( + df + .drop('cpu', strict=False) + .rename({'__cpu': 'cpu'}) + ) # Restrict the set of data we will process to a given set of tasks if tasks is not None: diff --git a/lisa/trace.py b/lisa/trace.py index 4ce6211df..7e6cbb5a5 100644 --- a/lisa/trace.py +++ b/lisa/trace.py @@ -1082,11 +1082,15 @@ class TraceDumpTraceParser(TraceParserBase): ) def _fixup_df(self, event, df, pid_comms): - df = df.rename({ - 'common_ts': 'Time', - 'common_pid': '__pid', - 'common_cpu': '__cpu', - }) + df = ( + df + .drop('Time', '__pid', '__cpu', strict=False) + .rename({ + 'common_ts': 'Time', + 'common_pid': '__pid', + 'common_cpu': '__cpu', + }) + ) df = df.with_columns([ pl.col('Time').cast(pl.Duration("ns")), pl.col('__pid').replace_strict(pid_comms, default=None).alias('__comm') @@ -1998,7 +2002,11 @@ class TxtTraceParserBase(TraceParserBase): pl.col(name).cast(dtype) for name, dtype in infer_schema(df).items() ) - df = df.rename({'__timestamp': 'Time'}) + df = ( + df + .drop('Time', strict=False) + .rename({'__timestamp': 'Time'}) + ) schema = df.collect_schema() if event == 'sched_switch': @@ -2032,7 +2040,11 @@ class TxtTraceParserBase(TraceParserBase): # In-kernel name is "cpumask", "cpus" is just an artifact of the pretty # printing format string of ftrace, that happens to be used by a # specific parser. - df = df.rename({'cpus': 'cpumask'}) + df = ( + df + .drop('cpumask', strict=False) + .rename({'cpus': 'cpumask'}) + ) if event == 'thermal_power_cpu_get_power': if isinstance(schema['load'], (pl.String, pl.Binary, pl.Categorical)): -- GitLab