Skip to content

Commit e6ffbb0

Browse files
hoshinolina authored and marcan committed
kboot_gpu: Add proper max-power and leakage calculations for t600x/t8103
Signed-off-by: Asahi Lina <[email protected]>
1 parent 057151c commit e6ffbb0

2 files changed

Lines changed: 201 additions & 21 deletions

File tree

Makefile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@ OBJECTS := \
102102
i2c.o \
103103
iodev.o \
104104
iova.o \
105-
kboot.o kboot_gpu.o \
105+
kboot.o \
106106
main.o \
107107
mcc.o \
108108
memory.o memory_asm.o \
@@ -130,6 +130,7 @@ OBJECTS := \
130130
$(MINILZLIB_OBJECTS) $(TINF_OBJECTS) $(DLMALLOC_OBJECTS) $(LIBFDT_OBJECTS) $(RUST_LIBS)
131131

132132
FP_OBJECTS := \
133+
kboot_gpu.o \
133134
math/expf.o \
134135
math/exp2f_data.o \
135136

src/kboot_gpu.c

Lines changed: 199 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
#include "kboot.h"
44
#include "adt.h"
55
#include "assert.h"
6+
#include "math.h"
7+
#include "pmgr.h"
68
#include "soc.h"
79
#include "utils.h"
810

@@ -22,49 +24,129 @@ struct perf_state {
2224
u32 volt;
2325
};
2426

27+
/*
 * Count the GPU cores reported as present in each cluster.
 *
 * Calls pmgr_adt_power_enable() on /arm-io/sgx, looks up the first "reg"
 * range of that node, and assembles a 64-bit bitmap from two 32-bit
 * registers at offsets 0xd01500 (low half) and 0xd01514 (high half).
 * The bitmap is then consumed ncores bits at a time, lowest bits first,
 * and the number of set bits in each group is stored in count[0..nclusters).
 *
 * Returns 0 on success; ADT lookup failures go through bail().
 */
static int get_core_counts(u32 *count, u32 nclusters, u32 ncores)
{
    int sgx_path[8];
    u64 sgx_base;

    pmgr_adt_power_enable("/arm-io/sgx");

    if (adt_path_offset_trace(adt, "/arm-io/sgx", sgx_path) < 0)
        bail("ADT: GPU: Failed to get sgx\n");

    if (adt_get_reg(adt, sgx_path, "reg", 0, &sgx_base, NULL) < 0)
        bail("ADT: GPU: Failed to get sgx reg 0\n");

    /* Low word first, then the high word shifted into the upper half. */
    u64 present = read32(sgx_base + 0xd01500);
    present |= ((u64)read32(sgx_base + 0xd01514)) << 32;

    u32 cluster = 0;
    while (cluster < nclusters) {
        count[cluster] = __builtin_popcount(present & MASK(ncores));
        present >>= ncores;
        cluster++;
    }

    return 0;
}
51+
52+
static void adjust_leakage(float *val, u32 clusters, u32 *cores, u32 max, float uncore_fraction)
53+
{
54+
for (u32 i = 0; i < clusters; i++) {
55+
float uncore = val[i] * uncore_fraction;
56+
float core = val[i] - uncore;
57+
58+
val[i] = uncore + (cores[i] / (float)max) * core;
59+
}
60+
}
61+
62+
static void load_fuses(float *out, u32 count, u64 base, u32 start, u32 width, float scale,
63+
float offset, bool flip)
64+
{
65+
for (u32 i = 0; i < count; i++) {
66+
base += (start / 32) * 4;
67+
start &= 31;
68+
69+
u32 low = read32(base);
70+
u32 high = read32(base + 4);
71+
u32 val = (((((u64)high) << 32) | low) >> start) & MASK(width);
72+
73+
float fval = (float)val * scale + offset;
74+
75+
if (flip)
76+
out[count - i - 1] = fval;
77+
else
78+
out[i] = fval;
79+
80+
start += width;
81+
}
82+
}
83+
2584
// Per-perf-state power scale factors used when chip_id is T8103.
// calc_power_t81xx() multiplies each entry by the matching core voltage and
// by 100 to fill max_pwr[], and requires the ADT perf state count to equal
// the number of entries here.
static u32 t8103_pwr_scale[] = {0, 63, 80, 108, 150, 198, 210};
2685

2786
// TODO this isn't a static table any more
2887
// Per-perf-state power scale factors used when chip_id is T8112.
// calc_power_t81xx() multiplies each entry by the matching core voltage and
// by 100 to fill max_pwr[], and requires the ADT perf state count to equal
// the number of entries here.
static u32 t8112_pwr_scale[] = {0, 66, 92, 119, 153, 184, 214, 240, 240};
2988

30-
/*
 * Compute the max-power table and leakage coefficients for T8103/T8112.
 *
 * count        number of perf states per table; must match the chip's
 *              pwr_scale table length
 * table_count  number of perf state tables; must be 1 on these chips
 * core         core perf states (voltage is read from .volt)
 * sram         SRAM perf states; not used on these chips
 * max_pwr      out: one entry per perf state, core volt * pwr_scale * 100
 * core_leak    out: core_leak[0] receives the core leakage coefficient
 * sram_leak    out: sram_leak[0] receives the SRAM leakage coefficient
 *
 * Returns 0 on success, nonzero if the chip is unsupported, the core count
 * cannot be read, or the ADT table dimensions are unexpected.
 */
static int calc_power_t81xx(u32 count, u32 table_count, const struct perf_state *core,
                            const struct perf_state *sram, u32 *max_pwr, float *core_leak,
                            float *sram_leak)
{
    UNUSED(sram);
    u32 *pwr_scale;
    u32 pwr_scale_count;
    u32 core_count;
    u32 max_cores;

    switch (chip_id) {
        case T8103:
            pwr_scale = t8103_pwr_scale;
            pwr_scale_count = ARRAY_SIZE(t8103_pwr_scale);
            max_cores = 8;
            break;
        case T8112:
            pwr_scale = t8112_pwr_scale;
            pwr_scale_count = ARRAY_SIZE(t8112_pwr_scale);
            // max_cores must be set on every path: it is the bitmap width
            // for get_core_counts() and the divisor in adjust_leakage().
            // NOTE(review): 10 is the full M2 GPU core count - confirm.
            max_cores = 10;
            break;
        default:
            bail("ADT: GPU: Unsupported chip\n");
    }

    if (get_core_counts(&core_count, 1, max_cores))
        return -1;

    if (table_count != 1)
        bail("ADT: GPU: expected 1 perf state table but got %d\n", table_count);

    if (count != pwr_scale_count)
        bail("ADT: GPU: expected %d perf states but got %d\n", pwr_scale_count, count);

    for (u32 i = 0; i < pwr_scale_count; i++)
        max_pwr[i] = (u32)core[i].volt * (u32)pwr_scale[i] * 100;

    // Fixed base coefficients, then scaled down for cores that are absent.
    // NOTE(review): the same base values are applied to both chips - confirm
    // they are appropriate for T8112.
    core_leak[0] = 1000.0;
    sram_leak[0] = 45.0;

    adjust_leakage(core_leak, 1, &core_count, max_cores, 0.12);
    adjust_leakage(sram_leak, 1, &core_count, max_cores, 0.2);

    return 0;
}
59135

60-
// TODO
61-
static u32 t6000_pwr_scale[] = {0, 15, 19, 25, 34, 50, 100};
62-
63-
static int calc_power_t600x(int sgx, u32 count, u32 table_count,
64-
const struct perf_state *perf_states, u32 *max_pwr)
136+
static int calc_power_t600x(u32 count, u32 table_count, const struct perf_state *core,
137+
const struct perf_state *sram, u32 *max_pwr, float *core_leak,
138+
float *sram_leak)
65139
{
66-
UNUSED(sgx);
67-
UNUSED(perf_states);
140+
const float s_sram = 4.3547606;
141+
const float k_sram = 0.024927923;
142+
143+
// macOS difference: macOS uses a misbehaved piecewise function here
144+
// Since it's obviously wrong, let's just use only the first component
145+
const float s_core = 1.48461742;
146+
const float k_core = 0.39013552;
147+
148+
const float dk_core = 8.558;
149+
const float dk_sram = 0.05;
68150

69151
u32 nclusters = 0;
70152
switch (chip_id) {
@@ -79,16 +161,64 @@ static int calc_power_t600x(int sgx, u32 count, u32 table_count,
79161
break;
80162
}
81163

82-
u32 pwr_scale_count = ARRAY_SIZE(t6000_pwr_scale);
164+
u32 core_count[MAX_CLUSTERS];
165+
166+
if (get_core_counts(core_count, nclusters, 8))
167+
return -1;
168+
169+
load_fuses(core_leak + 0, min(4, nclusters), 0x2922bc1b8, 25, 13, 2, 2, false);
170+
load_fuses(sram_leak + 0, min(4, nclusters), 0x2922bc1cc, 4, 9, 1, 1, false);
171+
172+
if (nclusters == 8) {
173+
load_fuses(core_leak + 4, 4, 0x22922bc1b8, 25, 13, 2, 2, true);
174+
load_fuses(sram_leak + 4, 4, 0x22922bc1cc, 4, 9, 1, 1, true);
175+
}
176+
177+
printf("FDT: GPU: Core counts: ");
178+
for (u32 i = 0; i < nclusters; i++) {
179+
printf("%d ", core_count[i]);
180+
}
181+
printf("\n");
182+
183+
adjust_leakage(core_leak, nclusters, core_count, 8, 0.0825);
184+
adjust_leakage(sram_leak, nclusters, core_count, 8, 0.2247);
83185

84186
if (table_count != nclusters)
85187
bail("ADT: GPU: expected %d perf state tables but got %d\n", nclusters, table_count);
86188

87-
if (count != pwr_scale_count)
88-
bail("ADT: GPU: expected %d perf states but got %d\n", pwr_scale_count, count);
189+
max_pwr[0] = 0;
190+
191+
for (u32 i = 1; i < count; i++) {
192+
u32 total_mw = 0;
193+
194+
for (u32 j = 0; j < nclusters; j++) {
195+
// macOS difference: macOS truncates Hz to integer MHz before doing this math.
196+
// That's probably wrong, so let's not do that.
89197

90-
for (u32 i = 0; i < pwr_scale_count; i++) {
91-
max_pwr[i] = t6000_pwr_scale[i] * 1667430 * nclusters / 8;
198+
float mw = 0;
199+
size_t idx = j * count + i;
200+
201+
mw += sram[idx].volt / 1000.f * sram_leak[j] * k_sram *
202+
expf(sram[idx].volt / 1000.f * s_sram);
203+
mw += core[idx].volt / 1000.f * core_leak[j] * k_core *
204+
expf(core[idx].volt / 1000.f * s_core);
205+
206+
float sbase = sram[idx].volt / 750.f;
207+
float sram_v_p = sbase * sbase * sbase;
208+
mw += dk_sram * (sram[idx].freq / 1000000.f) * sram_v_p;
209+
210+
float cbase = core[idx].volt / 750.f;
211+
float core_v_p;
212+
if (core[idx].volt > 750)
213+
core_v_p = cbase * cbase * cbase; // v ^ 3
214+
else
215+
core_v_p = cbase * cbase; // v ^ 2
216+
mw += dk_core * (core[idx].freq / 1000000.f) * core_v_p;
217+
218+
total_mw += mw;
219+
}
220+
221+
max_pwr[i] = total_mw * 1000;
92222
}
93223

94224
return 0;
@@ -122,10 +252,31 @@ static int dt_set_region(void *dt, int sgx, const char *name, const char *path)
122252
return 0;
123253
}
124254

255+
int fdt_set_float_array(void *dt, int node, const char *name, float *val, int count)
256+
{
257+
fdt32_t data[MAX_CLUSTERS];
258+
259+
if (count > MAX_CLUSTERS)
260+
bail("FDT: GPU: fdt_set_float_array() with too many values\n");
261+
262+
memcpy(data, val, sizeof(float) * count);
263+
for (int i = 0; i < count; i++) {
264+
data[i] = cpu_to_fdt32(data[i]);
265+
}
266+
267+
if (fdt_setprop_inplace(dt, node, name, data, sizeof(u32) * count))
268+
bail("FDT: GPU: Failed to set %s\n", name);
269+
270+
return 0;
271+
}
272+
125273
int dt_set_gpu(void *dt)
126274
{
127-
int (*calc_power)(int sgx, u32 count, u32 table_count, const struct perf_state *perf,
128-
u32 *max_pwr);
275+
int (*calc_power)(u32 count, u32 table_count, const struct perf_state *perf,
276+
const struct perf_state *sram, u32 *max_pwr, float *core_leak,
277+
float *sram_leak);
278+
279+
printf("FDT: GPU: Initializing GPU info\n");
129280

130281
switch (chip_id) {
131282
case T8103:
@@ -177,16 +328,44 @@ int dt_set_gpu(void *dt)
177328
bail("ADT: GPU: perf-state-table-count too large\n");
178329

179330
u32 perf_states_len;
180-
const struct perf_state *perf_states;
331+
const struct perf_state *perf_states, *perf_states_sram;
181332

182333
perf_states = adt_getprop(adt, sgx, "perf-states", &perf_states_len);
183334
if (!perf_states ||
184335
perf_states_len != sizeof(*perf_states) * perf_state_count * perf_state_table_count)
185336
bail("ADT: GPU: invalid perf-states length\n");
186337

338+
perf_states_sram = adt_getprop(adt, sgx, "perf-states-sram", &perf_states_len);
339+
if (perf_states_sram &&
340+
perf_states_len != sizeof(*perf_states) * perf_state_count * perf_state_table_count)
341+
bail("ADT: GPU: invalid perf-states-sram length\n");
342+
187343
u32 max_pwr[MAX_PSTATES];
344+
float core_leak[MAX_CLUSTERS];
345+
float sram_leak[MAX_CLUSTERS];
346+
347+
if (calc_power(perf_state_count, perf_state_table_count, perf_states, perf_states_sram, max_pwr,
348+
core_leak, sram_leak))
349+
return -1;
350+
351+
printf("FDT: GPU: Max power table: ");
352+
for (u32 i = 0; i < perf_state_count; i++) {
353+
printf("%d ", max_pwr[i]);
354+
}
355+
printf("\nFDT: GPU: Core leakage table: ");
356+
for (u32 i = 0; i < perf_state_table_count; i++) {
357+
printf("%d.%03d ", (int)core_leak[i], ((int)(core_leak[i] * 1000) % 1000));
358+
}
359+
printf("\nFDT: GPU: SRAM leakage table: ");
360+
for (u32 i = 0; i < perf_state_table_count; i++) {
361+
printf("%d.%03d ", (int)sram_leak[i], ((int)(sram_leak[i] * 1000) % 1000));
362+
}
363+
printf("\n");
364+
365+
if (fdt_set_float_array(dt, gpu, "apple,core-leak-coef", core_leak, perf_state_table_count))
366+
return -1;
188367

189-
if (calc_power(sgx, perf_state_count, perf_state_table_count, perf_states, max_pwr))
368+
if (fdt_set_float_array(dt, gpu, "apple,sram-leak-coef", sram_leak, perf_state_table_count))
190369
return -1;
191370

192371
u32 i = 0;

0 commit comments

Comments
 (0)