Commit 268a362c63dcd89754566b4e04c8311847c7eabb

Authored by aliguori
1 parent 754d00ae

added -numa cmdline parameter parser (Andre Przywara)

Adds a -numa command line parameter and sets a QEMU global array with
the memory sizes. The CPU-to-node assignment is written into the
CPUState. If no specific values for memory and CPUs are given,
all resources will be split equally across all nodes.
This code currently supports only up to 64 virtual CPUs.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@7210 c046a42c-6fe2-441c-8c8c-71466251a162
cpu-defs.h
@@ -205,6 +205,7 @@ typedef struct CPUWatchpoint { @@ -205,6 +205,7 @@ typedef struct CPUWatchpoint {
205 \ 205 \
206 CPUState *next_cpu; /* next CPU sharing TB cache */ \ 206 CPUState *next_cpu; /* next CPU sharing TB cache */ \
207 int cpu_index; /* CPU index (informative) */ \ 207 int cpu_index; /* CPU index (informative) */ \
  208 + int numa_node; /* NUMA node this cpu belongs to */ \
208 int running; /* Nonzero if cpu is currently running(usermode). */ \ 209 int running; /* Nonzero if cpu is currently running(usermode). */ \
209 /* user data */ \ 210 /* user data */ \
210 void *opaque; \ 211 void *opaque; \
@@ -554,6 +554,7 @@ void cpu_exec_init(CPUState *env) @@ -554,6 +554,7 @@ void cpu_exec_init(CPUState *env)
554 cpu_index++; 554 cpu_index++;
555 } 555 }
556 env->cpu_index = cpu_index; 556 env->cpu_index = cpu_index;
  557 + env->numa_node = 0;
557 TAILQ_INIT(&env->breakpoints); 558 TAILQ_INIT(&env->breakpoints);
558 TAILQ_INIT(&env->watchpoints); 559 TAILQ_INIT(&env->watchpoints);
559 *penv = env; 560 *penv = env;
qemu-options.hx
@@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs @@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
47 to 4. 47 to 4.
48 ETEXI 48 ETEXI
49 49
  50 +DEF("numa", HAS_ARG, QEMU_OPTION_numa,
  51 + "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
  52 +STEXI
  53 +@item -numa @var{opts}
  54 +Simulate a multi-node NUMA system. If mem and cpus are omitted, resources
  55 +are split equally across all nodes.
  56 +ETEXI
  57 +
50 DEF("fda", HAS_ARG, QEMU_OPTION_fda, 58 DEF("fda", HAS_ARG, QEMU_OPTION_fda,
51 "-fda/-fdb file use 'file' as floppy disk 0/1 image\n") 59 "-fda/-fdb file use 'file' as floppy disk 0/1 image\n")
52 DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "") 60 DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
sysemu.h
@@ -108,6 +108,10 @@ extern int old_param; @@ -108,6 +108,10 @@ extern int old_param;
108 extern int kqemu_allowed; 108 extern int kqemu_allowed;
109 #endif 109 #endif
110 110
  111 +#define MAX_NODES 64
  112 +extern int nb_numa_nodes;
  113 +extern uint64_t node_mem[MAX_NODES];
  114 +
111 #define MAX_OPTION_ROMS 16 115 #define MAX_OPTION_ROMS 16
112 extern const char *option_rom[MAX_OPTION_ROMS]; 116 extern const char *option_rom[MAX_OPTION_ROMS];
113 extern int nb_option_roms; 117 extern int nb_option_roms;
@@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname); @@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname);
248 void do_usb_del(Monitor *mon, const char *devname); 252 void do_usb_del(Monitor *mon, const char *devname);
249 void usb_info(Monitor *mon); 253 void usb_info(Monitor *mon);
250 254
251 -const char *get_opt_name(char *buf, int buf_size, const char *p); 255 +const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
252 const char *get_opt_value(char *buf, int buf_size, const char *p); 256 const char *get_opt_value(char *buf, int buf_size, const char *p);
253 int get_param_value(char *buf, int buf_size, 257 int get_param_value(char *buf, int buf_size,
254 const char *tag, const char *str); 258 const char *tag, const char *str);
@@ -265,6 +265,10 @@ const char *prom_envs[MAX_PROM_ENVS]; @@ -265,6 +265,10 @@ const char *prom_envs[MAX_PROM_ENVS];
265 int nb_drives_opt; 265 int nb_drives_opt;
266 struct drive_opt drives_opt[MAX_DRIVES]; 266 struct drive_opt drives_opt[MAX_DRIVES];
267 267
  268 +int nb_numa_nodes;
  269 +uint64_t node_mem[MAX_NODES];
  270 +uint64_t node_cpumask[MAX_NODES];
  271 +
268 static CPUState *cur_cpu; 272 static CPUState *cur_cpu;
269 static CPUState *next_cpu; 273 static CPUState *next_cpu;
270 static int event_pending = 1; 274 static int event_pending = 1;
@@ -1865,12 +1869,12 @@ static int socket_init(void) @@ -1865,12 +1869,12 @@ static int socket_init(void)
1865 } 1869 }
1866 #endif 1870 #endif
1867 1871
1868 -const char *get_opt_name(char *buf, int buf_size, const char *p) 1872 +const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
1869 { 1873 {
1870 char *q; 1874 char *q;
1871 1875
1872 q = buf; 1876 q = buf;
1873 - while (*p != '\0' && *p != '=') { 1877 + while (*p != '\0' && *p != delim) {
1874 if (q && (q - buf) < buf_size - 1) 1878 if (q && (q - buf) < buf_size - 1)
1875 *q++ = *p; 1879 *q++ = *p;
1876 p++; 1880 p++;
@@ -1910,7 +1914,7 @@ int get_param_value(char *buf, int buf_size, @@ -1910,7 +1914,7 @@ int get_param_value(char *buf, int buf_size,
1910 1914
1911 p = str; 1915 p = str;
1912 for(;;) { 1916 for(;;) {
1913 - p = get_opt_name(option, sizeof(option), p); 1917 + p = get_opt_name(option, sizeof(option), p, '=');
1914 if (*p != '=') 1918 if (*p != '=')
1915 break; 1919 break;
1916 p++; 1920 p++;
@@ -1935,7 +1939,7 @@ int check_params(char *buf, int buf_size, @@ -1935,7 +1939,7 @@ int check_params(char *buf, int buf_size,
1935 1939
1936 p = str; 1940 p = str;
1937 while (*p != '\0') { 1941 while (*p != '\0') {
1938 - p = get_opt_name(buf, buf_size, p); 1942 + p = get_opt_name(buf, buf_size, p, '=');
1939 if (*p != '=') 1943 if (*p != '=')
1940 return -1; 1944 return -1;
1941 p++; 1945 p++;
@@ -2628,6 +2632,62 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque) @@ -2628,6 +2632,62 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
2628 return drives_table_idx; 2632 return drives_table_idx;
2629 } 2633 }
2630 2634
  2635 +static void numa_add(const char *optarg)
  2636 +{
  2637 + char option[128];
  2638 + char *endptr;
  2639 + unsigned long long value, endvalue;
  2640 + int nodenr;
  2641 +
  2642 + optarg = get_opt_name(option, 128, optarg, ',') + 1;
  2643 + if (!strcmp(option, "node")) {
  2644 + if (get_param_value(option, 128, "nodeid", optarg) == 0) {
  2645 + nodenr = nb_numa_nodes;
  2646 + } else {
  2647 + nodenr = strtoull(option, NULL, 10);
  2648 + }
  2649 +
  2650 + if (get_param_value(option, 128, "mem", optarg) == 0) {
  2651 + node_mem[nodenr] = 0;
  2652 + } else {
  2653 + value = strtoull(option, &endptr, 0);
  2654 + switch (*endptr) {
  2655 + case 0: case 'M': case 'm':
  2656 + value <<= 20;
  2657 + break;
  2658 + case 'G': case 'g':
  2659 + value <<= 30;
  2660 + break;
  2661 + }
  2662 + node_mem[nodenr] = value;
  2663 + }
  2664 + if (get_param_value(option, 128, "cpus", optarg) == 0) {
  2665 + node_cpumask[nodenr] = 0;
  2666 + } else {
  2667 + value = strtoull(option, &endptr, 10);
  2668 + if (value >= 64) {
  2669 + value = 63;
  2670 + fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
  2671 + } else {
  2672 + if (*endptr == '-') {
  2673 + endvalue = strtoull(endptr+1, &endptr, 10);
  2674 + if (endvalue >= 63) {
  2675 + endvalue = 62;
  2676 + fprintf(stderr,
  2677 + "only 63 CPUs in NUMA mode supported.\n");
  2678 + }
  2679 + value = (1 << (endvalue + 1)) - (1 << value);
  2680 + } else {
  2681 + value = 1 << value;
  2682 + }
  2683 + }
  2684 + node_cpumask[nodenr] = value;
  2685 + }
  2686 + nb_numa_nodes++;
  2687 + }
  2688 + return;
  2689 +}
  2690 +
2631 /***********************************************************/ 2691 /***********************************************************/
2632 /* USB devices */ 2692 /* USB devices */
2633 2693
@@ -4290,6 +4350,7 @@ int main(int argc, char **argv, char **envp) @@ -4290,6 +4350,7 @@ int main(int argc, char **argv, char **envp)
4290 const char *chroot_dir = NULL; 4350 const char *chroot_dir = NULL;
4291 const char *run_as = NULL; 4351 const char *run_as = NULL;
4292 #endif 4352 #endif
  4353 + CPUState *env;
4293 4354
4294 qemu_cache_utils_init(envp); 4355 qemu_cache_utils_init(envp);
4295 4356
@@ -4353,12 +4414,18 @@ int main(int argc, char **argv, char **envp) @@ -4353,12 +4414,18 @@ int main(int argc, char **argv, char **envp)
4353 virtio_consoles[i] = NULL; 4414 virtio_consoles[i] = NULL;
4354 virtio_console_index = 0; 4415 virtio_console_index = 0;
4355 4416
  4417 + for (i = 0; i < MAX_NODES; i++) {
  4418 + node_mem[i] = 0;
  4419 + node_cpumask[i] = 0;
  4420 + }
  4421 +
4356 usb_devices_index = 0; 4422 usb_devices_index = 0;
4357 4423
4358 nb_net_clients = 0; 4424 nb_net_clients = 0;
4359 nb_bt_opts = 0; 4425 nb_bt_opts = 0;
4360 nb_drives = 0; 4426 nb_drives = 0;
4361 nb_drives_opt = 0; 4427 nb_drives_opt = 0;
  4428 + nb_numa_nodes = 0;
4362 hda_index = -1; 4429 hda_index = -1;
4363 4430
4364 nb_nics = 0; 4431 nb_nics = 0;
@@ -4508,6 +4575,13 @@ int main(int argc, char **argv, char **envp) @@ -4508,6 +4575,13 @@ int main(int argc, char **argv, char **envp)
4508 ",trans=none" : ""); 4575 ",trans=none" : "");
4509 } 4576 }
4510 break; 4577 break;
  4578 + case QEMU_OPTION_numa:
  4579 + if (nb_numa_nodes >= MAX_NODES) {
  4580 + fprintf(stderr, "qemu: too many NUMA nodes\n");
  4581 + exit(1);
  4582 + }
  4583 + numa_add(optarg);
  4584 + break;
4511 case QEMU_OPTION_nographic: 4585 case QEMU_OPTION_nographic:
4512 nographic = 1; 4586 nographic = 1;
4513 break; 4587 break;
@@ -5211,6 +5285,48 @@ int main(int argc, char **argv, char **envp) @@ -5211,6 +5285,48 @@ int main(int argc, char **argv, char **envp)
5211 } 5285 }
5212 } 5286 }
5213 5287
  5288 + if (nb_numa_nodes > 0) {
  5289 + int i;
  5290 +
  5291 + if (nb_numa_nodes > smp_cpus) {
  5292 + nb_numa_nodes = smp_cpus;
  5293 + }
  5294 +
  5295 + /* If no memory size is given for any node, assume the default case
  5296 + * and distribute the available memory equally across all nodes
  5297 + */
  5298 + for (i = 0; i < nb_numa_nodes; i++) {
  5299 + if (node_mem[i] != 0)
  5300 + break;
  5301 + }
  5302 + if (i == nb_numa_nodes) {
  5303 + uint64_t usedmem = 0;
  5304 +
  5305 + /* On Linux, each node's border has to be 8MB aligned,
  5306 + * the final node gets the rest.
  5307 + */
  5308 + for (i = 0; i < nb_numa_nodes - 1; i++) {
  5309 + node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
  5310 + usedmem += node_mem[i];
  5311 + }
  5312 + node_mem[i] = ram_size - usedmem;
  5313 + }
  5314 +
  5315 + for (i = 0; i < nb_numa_nodes; i++) {
  5316 + if (node_cpumask[i] != 0)
  5317 + break;
  5318 + }
  5319 + /* assigning the VCPUs round-robin is easier to implement, guest OSes
  5320 + * must cope with this anyway, because there are BIOSes out there in
  5321 + * real machines which also use this scheme.
  5322 + */
  5323 + if (i == nb_numa_nodes) {
  5324 + for (i = 0; i < smp_cpus; i++) {
  5325 + node_cpumask[i % nb_numa_nodes] |= 1 << i;
  5326 + }
  5327 + }
  5328 + }
  5329 +
5214 if (kvm_enabled()) { 5330 if (kvm_enabled()) {
5215 int ret; 5331 int ret;
5216 5332
@@ -5274,6 +5390,15 @@ int main(int argc, char **argv, char **envp) @@ -5274,6 +5390,15 @@ int main(int argc, char **argv, char **envp)
5274 machine->init(ram_size, vga_ram_size, boot_devices, 5390 machine->init(ram_size, vga_ram_size, boot_devices,
5275 kernel_filename, kernel_cmdline, initrd_filename, cpu_model); 5391 kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
5276 5392
  5393 +
  5394 + for (env = first_cpu; env != NULL; env = env->next_cpu) {
  5395 + for (i = 0; i < nb_numa_nodes; i++) {
  5396 + if (node_cpumask[i] & (1 << env->cpu_index)) {
  5397 + env->numa_node = i;
  5398 + }
  5399 + }
  5400 + }
  5401 +
5277 current_machine = machine; 5402 current_machine = machine;
5278 5403
5279 /* Set KVM's vcpu state to qemu's initial CPUState. */ 5404 /* Set KVM's vcpu state to qemu's initial CPUState. */