// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2016-2019 Intel Corporation. All rights reserved. */
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/memory.h>
#include <linux/module.h>
#include <linux/device.h>
#include <linux/pfn_t.h>
#include <linux/slab.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include "dax-private.h"
#include "bus.h"

/* Memory resource name used for add_memory_driver_managed(). */
static const char *kmem_name;
/* Set if any memory will remain added when the driver is unloaded. */
static bool any_hotremove_failed;

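/*
 * Compute the memory-block-aligned subset of the i'th dev_dax range that
 * can be hotplugged. Returns -ENOSPC (with *r set to the unaligned range
 * for error reporting) when the range spans less than one memory block.
 */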
static int dax_kmem_range(struct dev_dax *dev_dax, int i, struct range *r)
{
	struct dev_dax_range *dax_range = &dev_dax->ranges[i];
	struct range *range = &dax_range->range;

	/* memory-block align the hotplug range */
	r->start = ALIGN(range->start, memory_block_size_bytes());
	r->end = ALIGN_DOWN(range->end + 1, memory_block_size_bytes()) - 1;
	if (r->start >= r->end) {
		r->start = range->start;
		r->end = range->end;
		return -ENOSPC;
	}
	return 0;
}

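/*
 * Per-device bookkeeping: the duplicated resource name (the resource core
 * keeps a pointer to it, so it must outlive the requested regions) and one
 * resource per dev_dax range in a flexible array.
 */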
struct dax_kmem_data {
	const char *res_name;
	struct resource *res[];
};

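/*
 * Hotplug each dev_dax range as driver-managed System RAM. Failures are
 * fatal only while nothing has been mapped yet; once any range has been
 * onlined, per-range failures are skipped instead, since onlined memory
 * cannot be safely unwound.
 */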
static int dev_dax_kmem_probe(struct dev_dax *dev_dax)
{
	struct device *dev = &dev_dax->dev;
	struct dax_kmem_data *data;
	int rc = -ENOMEM;
	int i, mapped = 0;
	int numa_node;

	/*
	 * Ensure good NUMA information for the persistent memory.
	 * Without this check, there is a risk that slow memory
	 * could be mixed in a node with faster memory, causing
	 * unavoidable performance issues.
	 */
	numa_node = dev_dax->target_node;
	if (numa_node < 0) {
		dev_warn(dev, "rejecting DAX region with invalid node: %d\n",
				numa_node);
		return -EINVAL;
	}

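	/* One trailing resource pointer per range (flexible array member). */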
	data = kzalloc(sizeof(*data) + sizeof(struct resource *) * dev_dax->nr_range, GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
	if (!data->res_name)
		goto err_res_name;

	for (i = 0; i < dev_dax->nr_range; i++) {
		struct resource *res;
		struct range range;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc) {
			dev_info(dev, "mapping%d: %#llx-%#llx too small after alignment\n",
					i, range.start, range.end);
			continue;
		}

		/* Region is permanently reserved if hotremove fails. */
		res = request_mem_region(range.start, range_len(&range), data->res_name);
		if (!res) {
			dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve region\n",
					i, range.start, range.end);
			/*
			 * Once some memory has been onlined we can't
			 * assume that it can be un-onlined safely.
			 */
			if (mapped)
				continue;
			rc = -EBUSY;
			goto err_request_mem;
		}
		data->res[i] = res;

		/*
		 * Set flags appropriate for System RAM. Leave ..._BUSY clear
		 * so that add_memory() can add a child resource. Do not
		 * inherit flags from the parent since it may set new flags
		 * unknown to us that will break add_memory() below.
		 */
		res->flags = IORESOURCE_SYSTEM_RAM;

		/*
		 * Ensure that future kexec'd kernels will not treat
		 * this as RAM automatically.
		 */
		rc = add_memory_driver_managed(numa_node, range.start,
				range_len(&range), kmem_name, MHP_NONE);

		if (rc) {
			dev_warn(dev, "mapping%d: %#llx-%#llx memory add failed\n",
					i, range.start, range.end);
			release_resource(res);
			kfree(res);
			data->res[i] = NULL;
			if (mapped)
				continue;
			goto err_request_mem;
		}
		mapped++;
	}

	dev_set_drvdata(dev, data);

	return 0;

err_request_mem:
	kfree(data->res_name);
err_res_name:
	kfree(data);
	return rc;
}

#ifdef CONFIG_MEMORY_HOTREMOVE
static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	int i, success = 0;
	struct device *dev = &dev_dax->dev;
	struct dax_kmem_data *data = dev_get_drvdata(dev);

	/*
	 * We have one shot at removing memory: if any memory blocks were
	 * not offlined before this call, remove_memory() will fail and
	 * there is no way to hotremove the memory until reboot, because
	 * device unbind succeeds even if we return failure here.
	 */
	for (i = 0; i < dev_dax->nr_range; i++) {
		struct range range;
		int rc;

		rc = dax_kmem_range(dev_dax, i, &range);
		if (rc)
			continue;

		rc = remove_memory(dev_dax->target_node, range.start,
				range_len(&range));
		if (rc == 0) {
			release_resource(data->res[i]);
			kfree(data->res[i]);
			data->res[i] = NULL;
			success++;
			continue;
		}
		any_hotremove_failed = true;
		dev_err(dev,
			"mapping%d: %#llx-%#llx cannot be hotremoved until the next reboot\n",
			i, range.start, range.end);
	}

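	/*
	 * Free the bookkeeping only if every range was removed; otherwise
	 * the remaining resources (and the name they point at) must stay
	 * allocated for the ranges that are still plugged in.
	 */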
	if (success >= dev_dax->nr_range) {
		kfree(data->res_name);
		kfree(data);
		dev_set_drvdata(dev, NULL);
	}

	return 0;
}
#else
static int dev_dax_kmem_remove(struct dev_dax *dev_dax)
{
	/*
	 * Without hotremove, purposely leak the request_mem_region() for
	 * the device-dax range and return '0' to ->remove() attempts. The
	 * removal of the device from the driver always succeeds, but the
	 * region is permanently pinned as reserved by the unreleased
	 * request_mem_region().
	 */
	any_hotremove_failed = true;
	return 0;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */

static struct dax_device_driver device_dax_kmem_driver = {
	.probe = dev_dax_kmem_probe,
	.remove = dev_dax_kmem_remove,
};

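/*
 * A sketch of typical usage from userspace (assumed sysfs paths, not part
 * of this file): rebind a device-dax instance to this driver so its range
 * is hotplugged as System RAM, e.g.:
 *
 *   echo dax0.0 > /sys/bus/dax/drivers/device_dax/unbind
 *   echo dax0.0 > /sys/bus/dax/drivers/kmem/new_id
 *
 * or equivalently "daxctl reconfigure-device --mode=system-ram dax0.0".
 */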
static int __init dax_kmem_init(void)
{
	int rc;

	/* Resource name is permanently allocated if any hotremove fails. */
	kmem_name = kstrdup_const("System RAM (kmem)", GFP_KERNEL);
	if (!kmem_name)
		return -ENOMEM;

	rc = dax_driver_register(&device_dax_kmem_driver);
	if (rc)
		kfree_const(kmem_name);
	return rc;
}

static void __exit dax_kmem_exit(void)
{
	dax_driver_unregister(&device_dax_kmem_driver);
	if (!any_hotremove_failed)
		kfree_const(kmem_name);
}

MODULE_AUTHOR("Intel Corporation");
MODULE_LICENSE("GPL v2");
module_init(dax_kmem_init);
module_exit(dax_kmem_exit);
MODULE_ALIAS_DAX_DEVICE(0);