Commit 268a362c63dcd89754566b4e04c8311847c7eabb

Authored by aliguori
1 parent 754d00ae

added -numa cmdline parameter parser (Andre Przywara)

adds a -numa command line parameter and sets a QEMU global array with
the memory sizes. The CPU-to-node assignment is written into the
CPUState. If no specific values for memory and CPUs are given,
all resources will be split equally across all nodes.
This code currently supports only up to 64 virtual CPUs.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>


git-svn-id: svn://svn.savannah.nongnu.org/qemu/trunk@7210 c046a42c-6fe2-441c-8c8c-71466251a162
cpu-defs.h
... ... @@ -205,6 +205,7 @@ typedef struct CPUWatchpoint {
205 205 \
206 206 CPUState *next_cpu; /* next CPU sharing TB cache */ \
207 207 int cpu_index; /* CPU index (informative) */ \
  208 + int numa_node; /* NUMA node this cpu is belonging to */ \
208 209 int running; /* Nonzero if cpu is currently running(usermode). */ \
209 210 /* user data */ \
210 211 void *opaque; \
... ...
... ... @@ -554,6 +554,7 @@ void cpu_exec_init(CPUState *env)
554 554 cpu_index++;
555 555 }
556 556 env->cpu_index = cpu_index;
  557 + env->numa_node = 0;
557 558 TAILQ_INIT(&env->breakpoints);
558 559 TAILQ_INIT(&env->watchpoints);
559 560 *penv = env;
... ...
qemu-options.hx
... ... @@ -47,6 +47,14 @@ CPUs are supported. On Sparc32 target, Linux limits the number of usable CPUs
47 47 to 4.
48 48 ETEXI
49 49  
  50 +DEF("numa", HAS_ARG, QEMU_OPTION_numa,
  51 + "-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n")
  52 +STEXI
  53 +@item -numa @var{opts}
  54 +Simulate a multi node NUMA system. If mem and cpus are omitted, resources
  55 +are split equally.
  56 +ETEXI
  57 +
50 58 DEF("fda", HAS_ARG, QEMU_OPTION_fda,
51 59 "-fda/-fdb file use 'file' as floppy disk 0/1 image\n")
52 60 DEF("fdb", HAS_ARG, QEMU_OPTION_fdb, "")
... ...
sysemu.h
... ... @@ -108,6 +108,10 @@ extern int old_param;
108 108 extern int kqemu_allowed;
109 109 #endif
110 110  
  111 +#define MAX_NODES 64
  112 +extern int nb_numa_nodes;
  113 +extern uint64_t node_mem[MAX_NODES];
  114 +
111 115 #define MAX_OPTION_ROMS 16
112 116 extern const char *option_rom[MAX_OPTION_ROMS];
113 117 extern int nb_option_roms;
... ... @@ -248,7 +252,7 @@ void do_usb_add(Monitor *mon, const char *devname);
248 252 void do_usb_del(Monitor *mon, const char *devname);
249 253 void usb_info(Monitor *mon);
250 254  
251   -const char *get_opt_name(char *buf, int buf_size, const char *p);
  255 +const char *get_opt_name(char *buf, int buf_size, const char *p, char delim);
252 256 const char *get_opt_value(char *buf, int buf_size, const char *p);
253 257 int get_param_value(char *buf, int buf_size,
254 258 const char *tag, const char *str);
... ...
... ... @@ -265,6 +265,10 @@ const char *prom_envs[MAX_PROM_ENVS];
265 265 int nb_drives_opt;
266 266 struct drive_opt drives_opt[MAX_DRIVES];
267 267  
  268 +int nb_numa_nodes;
  269 +uint64_t node_mem[MAX_NODES];
  270 +uint64_t node_cpumask[MAX_NODES];
  271 +
268 272 static CPUState *cur_cpu;
269 273 static CPUState *next_cpu;
270 274 static int event_pending = 1;
... ... @@ -1865,12 +1869,12 @@ static int socket_init(void)
1865 1869 }
1866 1870 #endif
1867 1871  
1868   -const char *get_opt_name(char *buf, int buf_size, const char *p)
  1872 +const char *get_opt_name(char *buf, int buf_size, const char *p, char delim)
1869 1873 {
1870 1874 char *q;
1871 1875  
1872 1876 q = buf;
1873   - while (*p != '\0' && *p != '=') {
  1877 + while (*p != '\0' && *p != delim) {
1874 1878 if (q && (q - buf) < buf_size - 1)
1875 1879 *q++ = *p;
1876 1880 p++;
... ... @@ -1910,7 +1914,7 @@ int get_param_value(char *buf, int buf_size,
1910 1914  
1911 1915 p = str;
1912 1916 for(;;) {
1913   - p = get_opt_name(option, sizeof(option), p);
  1917 + p = get_opt_name(option, sizeof(option), p, '=');
1914 1918 if (*p != '=')
1915 1919 break;
1916 1920 p++;
... ... @@ -1935,7 +1939,7 @@ int check_params(char *buf, int buf_size,
1935 1939  
1936 1940 p = str;
1937 1941 while (*p != '\0') {
1938   - p = get_opt_name(buf, buf_size, p);
  1942 + p = get_opt_name(buf, buf_size, p, '=');
1939 1943 if (*p != '=')
1940 1944 return -1;
1941 1945 p++;
... ... @@ -2628,6 +2632,62 @@ int drive_init(struct drive_opt *arg, int snapshot, void *opaque)
2628 2632 return drives_table_idx;
2629 2633 }
2630 2634  
  2635 +static void numa_add(const char *optarg)
  2636 +{
  2637 + char option[128];
  2638 + char *endptr;
  2639 + unsigned long long value, endvalue;
  2640 + int nodenr;
  2641 +
  2642 + optarg = get_opt_name(option, 128, optarg, ',') + 1;
  2643 + if (!strcmp(option, "node")) {
  2644 + if (get_param_value(option, 128, "nodeid", optarg) == 0) {
  2645 + nodenr = nb_numa_nodes;
  2646 + } else {
  2647 + nodenr = strtoull(option, NULL, 10);
  2648 + }
  2649 +
  2650 + if (get_param_value(option, 128, "mem", optarg) == 0) {
  2651 + node_mem[nodenr] = 0;
  2652 + } else {
  2653 + value = strtoull(option, &endptr, 0);
  2654 + switch (*endptr) {
  2655 + case 0: case 'M': case 'm':
  2656 + value <<= 20;
  2657 + break;
  2658 + case 'G': case 'g':
  2659 + value <<= 30;
  2660 + break;
  2661 + }
  2662 + node_mem[nodenr] = value;
  2663 + }
  2664 + if (get_param_value(option, 128, "cpus", optarg) == 0) {
  2665 + node_cpumask[nodenr] = 0;
  2666 + } else {
  2667 + value = strtoull(option, &endptr, 10);
  2668 + if (value >= 64) {
  2669 + value = 63;
  2670 + fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
  2671 + } else {
  2672 + if (*endptr == '-') {
  2673 + endvalue = strtoull(endptr+1, &endptr, 10);
  2674 + if (endvalue >= 63) {
  2675 + endvalue = 62;
  2676 + fprintf(stderr,
  2677 + "only 63 CPUs in NUMA mode supported.\n");
  2678 + }
  2679 + value = (1 << (endvalue + 1)) - (1 << value);
  2680 + } else {
  2681 + value = 1 << value;
  2682 + }
  2683 + }
  2684 + node_cpumask[nodenr] = value;
  2685 + }
  2686 + nb_numa_nodes++;
  2687 + }
  2688 + return;
  2689 +}
  2690 +
2631 2691 /***********************************************************/
2632 2692 /* USB devices */
2633 2693  
... ... @@ -4290,6 +4350,7 @@ int main(int argc, char **argv, char **envp)
4290 4350 const char *chroot_dir = NULL;
4291 4351 const char *run_as = NULL;
4292 4352 #endif
  4353 + CPUState *env;
4293 4354  
4294 4355 qemu_cache_utils_init(envp);
4295 4356  
... ... @@ -4353,12 +4414,18 @@ int main(int argc, char **argv, char **envp)
4353 4414 virtio_consoles[i] = NULL;
4354 4415 virtio_console_index = 0;
4355 4416  
  4417 + for (i = 0; i < MAX_NODES; i++) {
  4418 + node_mem[i] = 0;
  4419 + node_cpumask[i] = 0;
  4420 + }
  4421 +
4356 4422 usb_devices_index = 0;
4357 4423  
4358 4424 nb_net_clients = 0;
4359 4425 nb_bt_opts = 0;
4360 4426 nb_drives = 0;
4361 4427 nb_drives_opt = 0;
  4428 + nb_numa_nodes = 0;
4362 4429 hda_index = -1;
4363 4430  
4364 4431 nb_nics = 0;
... ... @@ -4508,6 +4575,13 @@ int main(int argc, char **argv, char **envp)
4508 4575 ",trans=none" : "");
4509 4576 }
4510 4577 break;
  4578 + case QEMU_OPTION_numa:
  4579 + if (nb_numa_nodes >= MAX_NODES) {
  4580 + fprintf(stderr, "qemu: too many NUMA nodes\n");
  4581 + exit(1);
  4582 + }
  4583 + numa_add(optarg);
  4584 + break;
4511 4585 case QEMU_OPTION_nographic:
4512 4586 nographic = 1;
4513 4587 break;
... ... @@ -5211,6 +5285,48 @@ int main(int argc, char **argv, char **envp)
5211 5285 }
5212 5286 }
5213 5287  
  5288 + if (nb_numa_nodes > 0) {
  5289 + int i;
  5290 +
  5291 + if (nb_numa_nodes > smp_cpus) {
  5292 + nb_numa_nodes = smp_cpus;
  5293 + }
  5294 +
  5295 + /* If no memory size is given for any node, assume the default case
  5296 + * and distribute the available memory equally across all nodes
  5297 + */
  5298 + for (i = 0; i < nb_numa_nodes; i++) {
  5299 + if (node_mem[i] != 0)
  5300 + break;
  5301 + }
  5302 + if (i == nb_numa_nodes) {
  5303 + uint64_t usedmem = 0;
  5304 +
  5305 + /* On Linux, each node's border has to be 8MB aligned,
  5306 + * the final node gets the rest.
  5307 + */
  5308 + for (i = 0; i < nb_numa_nodes - 1; i++) {
  5309 + node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1);
  5310 + usedmem += node_mem[i];
  5311 + }
  5312 + node_mem[i] = ram_size - usedmem;
  5313 + }
  5314 +
  5315 + for (i = 0; i < nb_numa_nodes; i++) {
  5316 + if (node_cpumask[i] != 0)
  5317 + break;
  5318 + }
  5319 + /* assigning the VCPUs round-robin is easier to implement, guest OSes
  5320 + * must cope with this anyway, because there are BIOSes out there in
  5321 + * real machines which also use this scheme.
  5322 + */
  5323 + if (i == nb_numa_nodes) {
  5324 + for (i = 0; i < smp_cpus; i++) {
  5325 + node_cpumask[i % nb_numa_nodes] |= 1 << i;
  5326 + }
  5327 + }
  5328 + }
  5329 +
5214 5330 if (kvm_enabled()) {
5215 5331 int ret;
5216 5332  
... ... @@ -5274,6 +5390,15 @@ int main(int argc, char **argv, char **envp)
5274 5390 machine->init(ram_size, vga_ram_size, boot_devices,
5275 5391 kernel_filename, kernel_cmdline, initrd_filename, cpu_model);
5276 5392  
  5393 +
  5394 + for (env = first_cpu; env != NULL; env = env->next_cpu) {
  5395 + for (i = 0; i < nb_numa_nodes; i++) {
  5396 + if (node_cpumask[i] & (1 << env->cpu_index)) {
  5397 + env->numa_node = i;
  5398 + }
  5399 + }
  5400 + }
  5401 +
5277 5402 current_machine = machine;
5278 5403  
5279 5404 /* Set KVM's vcpu state to qemu's initial CPUState. */
... ...