55 void record(
int tokens_generated, int64_t elapsed_ms);
81 float headroom = 0.9f,
82 int floor = 64)
const;
99 static constexpr double kAlpha = 0.3;
102 static constexpr int kMinTokens = 4;
104 std::atomic<double> ewma_tok_s_{0.0};
105 std::atomic<int> samples_{0};
EWMA-based throughput tracker for generation budgeting.
double tok_per_sec() const
Current smoothed (EWMA) throughput estimate, in tokens per second.
int sample_count() const
Number of recorded samples.
void reset()
Reset all throughput data.
int recommend_tokens(int64_t time_budget_ms, float headroom=0.9f, int floor=64) const
Recommend max_tokens to fit within a time budget.
void record(int tokens_generated, int64_t elapsed_ms)
Record a completed generation sample.
int64_t predict_ms(int token_count) const
Predict wall-clock time, in milliseconds, for generating token_count tokens.
Activate model on GPU (WARM → ACTIVE).