From 100a8fdbf525bb11796692a713c267be6523a890 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:48 +0100 Subject: thermal: trace: Trace temperature changes Create a new event to trace the temperature of a thermal zone. Using this event trace the temperature changes of the thermal zone every-time it is updated. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 4 ++++ include/trace/events/thermal.h | 38 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 include/trace/events/thermal.h diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 71b0ec0c370d..6b32391260a0 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -38,6 +38,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "thermal_core.h" #include "thermal_hwmon.h" @@ -463,6 +466,7 @@ static void update_temperature(struct thermal_zone_device *tz) tz->temperature = temp; mutex_unlock(&tz->lock); + trace_thermal_temperature(tz); dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", tz->last_temperature, tz->temperature); } diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h new file mode 100644 index 000000000000..8c5ca96eccd6 --- /dev/null +++ b/include/trace/events/thermal.h @@ -0,0 +1,38 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM thermal + +#if !defined(_TRACE_THERMAL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_THERMAL_H + +#include +#include + +TRACE_EVENT(thermal_temperature, + + TP_PROTO(struct thermal_zone_device *tz), + + TP_ARGS(tz), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, temp_prev) + __field(int, temp) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->temp_prev = tz->last_temperature; + __entry->temp = tz->temperature; + ), + + TP_printk("thermal_zone=%s id=%d temp_prev=%d temp=%d", + __get_str(thermal_zone), __entry->id, __entry->temp_prev, + __entry->temp) +); + +#endif /* _TRACE_THERMAL_H */ + +/* This part must be outside protection */ +#include -- cgit v1.2.3 From 39811569e43a81417bc0ddca3d0c7658c3dcd4b0 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:49 +0100 Subject: thermal: trace: Trace when a cooling device's state is updated Introduce and use an event to trace when a cooling device's state is updated. This is useful to follow the effect of governor decisions on cooling devices. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- drivers/thermal/thermal_core.c | 1 + include/trace/events/thermal.h | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 6b32391260a0..c74c78d28699 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1291,6 +1291,7 @@ void thermal_cdev_update(struct thermal_cooling_device *cdev) mutex_unlock(&cdev->lock); cdev->ops->set_cur_state(cdev, target); cdev->updated = true; + trace_cdev_update(cdev, target); dev_dbg(&cdev->device, "set to state %lu\n", target); } EXPORT_SYMBOL(thermal_cdev_update); diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 8c5ca96eccd6..894a79ea0686 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -32,6 +32,25 @@ TRACE_EVENT(thermal_temperature, __entry->temp) ); +TRACE_EVENT(cdev_update, + + TP_PROTO(struct thermal_cooling_device *cdev, unsigned long target), + + TP_ARGS(cdev, target), + + TP_STRUCT__entry( + __string(type, cdev->type) + __field(unsigned long, target) + ), + + TP_fast_assign( + __assign_str(type, cdev->type); + __entry->target = target; + ), + + TP_printk("type=%s target=%lu", __get_str(type), __entry->target) +); + #endif /* _TRACE_THERMAL_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 208cd822a19e683bc890f6708786f2420e172d76 Mon Sep 17 00:00:00 2001 From: Punit Agrawal Date: Tue, 29 Jul 2014 11:50:50 +0100 Subject: thermal: trace: Trace when temperature is above a trip point Create a new event to trace when the temperature is above a trip point. Use the trace-point when handling non-critical and critical trip pionts. Cc: Zhang Rui Cc: Eduardo Valentin Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Signed-off-by: Punit Agrawal Signed-off-by: Eduardo Valentin --- drivers/thermal/fair_share.c | 12 ++++++++++++ drivers/thermal/step_wise.c | 5 ++++- drivers/thermal/thermal_core.c | 2 ++ include/trace/events/thermal.h | 26 ++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/drivers/thermal/fair_share.c b/drivers/thermal/fair_share.c index 944ba2f340c8..6e0a3fbfae86 100644 --- a/drivers/thermal/fair_share.c +++ b/drivers/thermal/fair_share.c @@ -23,6 +23,7 @@ */ #include +#include #include "thermal_core.h" @@ -34,6 +35,7 @@ static int get_trip_level(struct thermal_zone_device *tz) { int count = 0; unsigned long trip_temp; + enum thermal_trip_type trip_type; if (tz->trips == 0 || !tz->ops->get_trip_temp) return 0; @@ -43,6 +45,16 @@ static int get_trip_level(struct thermal_zone_device *tz) if (tz->temperature < trip_temp) break; } + + /* + * count > 0 only if temperature is greater than first trip + * point, in which case, trip_point = count - 1 + */ + if (count > 0) { + tz->ops->get_trip_type(tz, count - 1, &trip_type); + trace_thermal_zone_trip(tz, count - 1, trip_type); + } + return count; } diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index f251521baaa2..3b54c2c226d8 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -23,6 +23,7 @@ */ #include +#include #include "thermal_core.h" @@ -129,8 +130,10 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) trend = get_tz_trend(tz, trip); - if (tz->temperature >= trip_temp) + if (tz->temperature >= trip_temp) { throttle = true; + trace_thermal_zone_trip(tz, trip, trip_type); + } dev_dbg(&tz->device, "Trip%d[type=%d,temp=%ld]:trend=%d,throttle=%d\n", trip, trip_type, trip_temp, trend, throttle); diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index c74c78d28699..454884aa15f7 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -371,6 +371,8 @@ static void handle_critical_trips(struct thermal_zone_device *tz, if (tz->temperature < trip_temp) return; + trace_thermal_zone_trip(tz, trip, trip_type); + if (tz->ops->notify) tz->ops->notify(tz, trip, trip_type); diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 894a79ea0686..0f4f95d63c03 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -51,6 +51,32 @@ TRACE_EVENT(cdev_update, TP_printk("type=%s target=%lu", __get_str(type), __entry->target) ); +TRACE_EVENT(thermal_zone_trip, + + TP_PROTO(struct thermal_zone_device *tz, int trip, + enum thermal_trip_type trip_type), + + TP_ARGS(tz, trip, trip_type), + + TP_STRUCT__entry( + __string(thermal_zone, tz->type) + __field(int, id) + __field(int, trip) + __field(enum thermal_trip_type, trip_type) + ), + + TP_fast_assign( + __assign_str(thermal_zone, tz->type); + __entry->id = tz->id; + __entry->trip = trip; + __entry->trip_type = trip_type; + ), + + TP_printk("thermal_zone=%s id=%d trip=%d trip_type=%d", + __get_str(thermal_zone), __entry->id, __entry->trip, + __entry->trip_type) +); + #endif /* _TRACE_THERMAL_H */ /* This part must be outside protection */ -- cgit v1.2.3 From a020279ee611c31ac163c00b21b975e7ecbb2e9c Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Fri, 25 Jul 2014 15:31:58 +0530 Subject: thermal: add support to disable thermal zone from DTS Add support to check status of the thermal zone before registering the zone. This will help on disabling some non-existing thermal zone from the top level DTS file out of common dtsi thermalzone file. For example, we have 3 platforms almost same but thermal zones on this platform are little bit different. Platform 1 and 2 have three thermal zones and platform 3 has two thermal zones. To avoid duplication of the thermal zones entries on each DTS file of platforms,we created one common dtsi file for thermal zone and included this dtsi file from these 3 platform's top level dts file. On common thermal zone com dtsi file, all thermal zone are enabled and need to disable one of thermal zone on platform 3 dts file. For this, we just added entry status = "disabled" for that thermal zone on platform 3 dts file and along with this change to make it work. This way, we reuse the common file and control the enable/disable of the thermal zone from top level dts file. Signed-off-by: Laxman Dewangan Signed-off-by: Eduardo Valentin --- drivers/thermal/of-thermal.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c index 4b2b999b7611..f8eb625b8400 100644 --- a/drivers/thermal/of-thermal.c +++ b/drivers/thermal/of-thermal.c @@ -401,6 +401,10 @@ thermal_zone_of_sensor_register(struct device *dev, int sensor_id, struct of_phandle_args sensor_specs; int ret, id; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + /* For now, thermal framework supports only 1 sensor per zone */ ret = of_parse_phandle_with_args(child, "thermal-sensors", "#thermal-sensor-cells", @@ -771,6 +775,10 @@ int __init of_parse_thermal_zones(void) struct thermal_zone_device *zone; struct thermal_zone_params *tzp; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + tz = thermal_of_build_thermal_zone(child); if (IS_ERR(tz)) { pr_err("failed to build thermal zone %s: %ld\n", @@ -838,6 +846,10 @@ void of_thermal_destroy_zones(void) for_each_child_of_node(np, child) { struct thermal_zone_device *zone; + /* Check whether child is enabled or not */ + if (!of_device_is_available(child)) + continue; + zone = thermal_zone_get_zone_by_name(child->name); if (IS_ERR(zone)) continue; -- cgit v1.2.3 From 3c94f17e72a7bcf689756da100b6051e535c45f4 Mon Sep 17 00:00:00 2001 From: Anson Huang Date: Wed, 6 Aug 2014 15:12:09 +0800 Subject: Thermal: imx: add i.mx6sx thermal support i.MX6SX has some new features of thermal interrupt function, there are LOW, HIGH and PANIC irq for thermal sensor, so add platform data to separate different thermal version; The reset value of LOW ALARM is 0 which means the highest temp, so the LOW ALARM will be triggered once irq is enabled, so we need to correct it before enabling thermal irq; Enable PANIC ALARM as critical trip point, it will trigger system reset via SRC module once PANIC IRQ is triggered, it is pure hardware function, so use it instead of software reset by cooling device. Signed-off-by: Anson Huang Tested-by: Shawn Guo Signed-off-by: Eduardo Valentin --- .../devicetree/bindings/thermal/imx-thermal.txt | 5 +- drivers/thermal/imx_thermal.c | 91 ++++++++++++++++++---- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/Documentation/devicetree/bindings/thermal/imx-thermal.txt b/Documentation/devicetree/bindings/thermal/imx-thermal.txt index 1f0f67234a91..3c67bd50aa10 100644 --- a/Documentation/devicetree/bindings/thermal/imx-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/imx-thermal.txt @@ -1,7 +1,10 @@ * Temperature Monitor (TEMPMON) on Freescale i.MX SoCs Required properties: -- compatible : "fsl,imx6q-thermal" +- compatible : "fsl,imx6q-tempmon" for i.MX6Q, "fsl,imx6sx-tempmon" for i.MX6SX. + i.MX6SX has two more IRQs than i.MX6Q, one is IRQ_LOW and the other is IRQ_PANIC, + when temperature is below than low threshold, IRQ_LOW will be triggered, when temperature + is higher than panic threshold, system will auto reboot by SRC module. - fsl,tempmon : phandle pointer to system controller that contains TEMPMON control registers, e.g. ANATOP on imx6q. - fsl,tempmon-data : phandle pointer to fuse controller that contains TEMPMON diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 2c516f2eebed..461bf3d033a0 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +32,11 @@ #define MISC0 0x0150 #define MISC0_REFTOP_SELBIASOFF (1 << 3) +#define MISC1 0x0160 +#define MISC1_IRQ_TEMPHIGH (1 << 29) +/* Below LOW and PANIC bits are only for TEMPMON_IMX6SX */ +#define MISC1_IRQ_TEMPLOW (1 << 28) +#define MISC1_IRQ_TEMPPANIC (1 << 27) #define TEMPSENSE0 0x0180 #define TEMPSENSE0_ALARM_VALUE_SHIFT 20 @@ -43,6 +49,12 @@ #define TEMPSENSE1 0x0190 #define TEMPSENSE1_MEASURE_FREQ 0xffff +/* Below TEMPSENSE2 is only for TEMPMON_IMX6SX */ +#define TEMPSENSE2 0x0290 +#define TEMPSENSE2_LOW_VALUE_SHIFT 0 +#define TEMPSENSE2_LOW_VALUE_MASK 0xfff +#define TEMPSENSE2_PANIC_VALUE_SHIFT 16 +#define TEMPSENSE2_PANIC_VALUE_MASK 0xfff0000 #define OCOTP_ANA1 0x04e0 @@ -66,6 +78,21 @@ enum imx_thermal_trip { #define FACTOR1 15976 #define FACTOR2 4297157 +#define TEMPMON_IMX6Q 1 +#define TEMPMON_IMX6SX 2 + +struct thermal_soc_data { + u32 version; +}; + +static struct thermal_soc_data thermal_imx6q_data = { + .version = TEMPMON_IMX6Q, +}; + +static struct thermal_soc_data thermal_imx6sx_data = { + .version = TEMPMON_IMX6SX, +}; + struct imx_thermal_data { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; @@ -79,8 +106,21 @@ struct imx_thermal_data { bool irq_enabled; int irq; struct clk *thermal_clk; + const struct thermal_soc_data *socdata; }; +static void imx_set_panic_temp(struct imx_thermal_data *data, + signed long panic_temp) +{ + struct regmap *map = data->tempmon; + int critical_value; + + critical_value = (data->c2 - panic_temp) / data->c1; + regmap_write(map, TEMPSENSE2 + REG_CLR, TEMPSENSE2_PANIC_VALUE_MASK); + regmap_write(map, TEMPSENSE2 + REG_SET, critical_value << + TEMPSENSE2_PANIC_VALUE_SHIFT); +} + static void imx_set_alarm_temp(struct imx_thermal_data *data, signed long alarm_temp) { @@ -142,13 +182,17 @@ static int imx_get_temp(struct thermal_zone_device *tz, unsigned long *temp) /* See imx_get_sensor_data() for formula derivation */ *temp = data->c2 - n_meas * data->c1; - /* Update alarm value to next higher trip point */ - if (data->alarm_temp == data->temp_passive && *temp >= data->temp_passive) - imx_set_alarm_temp(data, data->temp_critical); - if (data->alarm_temp == data->temp_critical && *temp < data->temp_passive) { - imx_set_alarm_temp(data, data->temp_passive); - dev_dbg(&tz->device, "thermal alarm off: T < %lu\n", - data->alarm_temp / 1000); + /* Update alarm value to next higher trip point for TEMPMON_IMX6Q */ + if (data->socdata->version == TEMPMON_IMX6Q) { + if (data->alarm_temp == data->temp_passive && + *temp >= data->temp_passive) + imx_set_alarm_temp(data, data->temp_critical); + if (data->alarm_temp == data->temp_critical && + *temp < data->temp_passive) { + imx_set_alarm_temp(data, data->temp_passive); + dev_dbg(&tz->device, "thermal alarm off: T < %lu\n", + data->alarm_temp / 1000); + } } if (*temp != data->last_temp) { @@ -398,8 +442,17 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev) return IRQ_HANDLED; } +static const struct of_device_id of_imx_thermal_match[] = { + { .compatible = "fsl,imx6q-tempmon", .data = &thermal_imx6q_data, }, + { .compatible = "fsl,imx6sx-tempmon", .data = &thermal_imx6sx_data, }, + { /* end */ } +}; +MODULE_DEVICE_TABLE(of, of_imx_thermal_match); + static int imx_thermal_probe(struct platform_device *pdev) { + const struct of_device_id *of_id = + of_match_device(of_imx_thermal_match, &pdev->dev); struct imx_thermal_data *data; struct cpumask clip_cpus; struct regmap *map; @@ -418,6 +471,20 @@ static int imx_thermal_probe(struct platform_device *pdev) } data->tempmon = map; + data->socdata = of_id->data; + + /* make sure the IRQ flag is clear before enabling irq on i.MX6SX */ + if (data->socdata->version == TEMPMON_IMX6SX) { + regmap_write(map, MISC1 + REG_CLR, MISC1_IRQ_TEMPHIGH | + MISC1_IRQ_TEMPLOW | MISC1_IRQ_TEMPPANIC); + /* + * reset value of LOW ALARM is incorrect, set it to lowest + * value to avoid false trigger of low alarm. + */ + regmap_write(map, TEMPSENSE2 + REG_SET, + TEMPSENSE2_LOW_VALUE_MASK); + } + data->irq = platform_get_irq(pdev, 0); if (data->irq < 0) return data->irq; @@ -489,6 +556,10 @@ static int imx_thermal_probe(struct platform_device *pdev) measure_freq = DIV_ROUND_UP(32768, 10); /* 10 Hz */ regmap_write(map, TEMPSENSE1 + REG_SET, measure_freq); imx_set_alarm_temp(data, data->temp_passive); + + if (data->socdata->version == TEMPMON_IMX6SX) + imx_set_panic_temp(data, data->temp_critical); + regmap_write(map, TEMPSENSE0 + REG_CLR, TEMPSENSE0_POWER_DOWN); regmap_write(map, TEMPSENSE0 + REG_SET, TEMPSENSE0_MEASURE_TEMP); @@ -550,12 +621,6 @@ static int imx_thermal_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(imx_thermal_pm_ops, imx_thermal_suspend, imx_thermal_resume); -static const struct of_device_id of_imx_thermal_match[] = { - { .compatible = "fsl,imx6q-tempmon", }, - { /* end */ } -}; -MODULE_DEVICE_TABLE(of, of_imx_thermal_match); - static struct platform_driver imx_thermal = { .driver = { .name = "imx_thermal", -- cgit v1.2.3 From cd6d92d2aa1556b22cd05acbc5f2cc8e5caafcc4 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 19 Aug 2014 12:38:01 +0800 Subject: pwm: fsl-ftm: Clean up the code This patch intends to prepare for converting to direct regmap API usage. Signed-off-by: Xiubo Li Signed-off-by: Thierry Reding --- drivers/pwm/pwm-fsl-ftm.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index a18bc8fea385..96982da52d86 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -21,11 +21,10 @@ #include #define FTM_SC 0x00 -#define FTM_SC_CLK_MASK 0x3 -#define FTM_SC_CLK_SHIFT 3 -#define FTM_SC_CLK(c) (((c) + 1) << FTM_SC_CLK_SHIFT) +#define FTM_SC_CLK_MASK_SHIFT 3 +#define FTM_SC_CLK_MASK (3 << FTM_SC_CLK_MASK_SHIFT) +#define FTM_SC_CLK(c) (((c) + 1) << FTM_SC_CLK_MASK_SHIFT) #define FTM_SC_PS_MASK 0x7 -#define FTM_SC_PS_SHIFT 0 #define FTM_CNT 0x04 #define FTM_MOD 0x08 @@ -258,7 +257,7 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, } val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_PS_MASK << FTM_SC_PS_SHIFT); + val &= ~FTM_SC_PS_MASK; val |= fpc->clk_ps; writel(val, fpc->base + FTM_SC); writel(period - 1, fpc->base + FTM_MOD); @@ -305,7 +304,7 @@ static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) /* select counter clock source */ val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_CLK_MASK << FTM_SC_CLK_SHIFT); + val &= ~FTM_SC_CLK_MASK; val |= FTM_SC_CLK(fpc->cnt_select); writel(val, fpc->base + FTM_SC); @@ -357,7 +356,7 @@ static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) /* no users left, disable PWM counter clock */ val = readl(fpc->base + FTM_SC); - val &= ~(FTM_SC_CLK_MASK << FTM_SC_CLK_SHIFT); + val &= ~FTM_SC_CLK_MASK; writel(val, fpc->base + FTM_SC); clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); -- cgit v1.2.3 From 42fa98a9c3609c1aff466cb847e421c611cc9157 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 19 Aug 2014 12:38:02 +0800 Subject: pwm: fsl-ftm: Convert to direct regmap API usage The regmap core supports different endian modes for devices. This patch convert to direct regmap API usage, preparing to support big endianness for LS1 SoC. Using the regmap framework it will be easy to support devices that only differ in endianness with the same device driver. Signed-off-by: Xiubo Li Signed-off-by: Thierry Reding --- drivers/pwm/pwm-fsl-ftm.c | 83 +++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/drivers/pwm/pwm-fsl-ftm.c b/drivers/pwm/pwm-fsl-ftm.c index 96982da52d86..0f2cc7ef7784 100644 --- a/drivers/pwm/pwm-fsl-ftm.c +++ b/drivers/pwm/pwm-fsl-ftm.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #define FTM_SC 0x00 @@ -82,7 +83,7 @@ struct fsl_pwm_chip { unsigned int cnt_select; unsigned int clk_ps; - void __iomem *base; + struct regmap *regmap; int period_ns; @@ -218,10 +219,11 @@ static unsigned long fsl_pwm_calculate_duty(struct fsl_pwm_chip *fpc, unsigned long period_ns, unsigned long duty_ns) { - unsigned long long val, duty; + unsigned long long duty; + u32 val; - val = readl(fpc->base + FTM_MOD); - duty = duty_ns * (val + 1); + regmap_read(fpc->regmap, FTM_MOD, &val); + duty = (unsigned long long)duty_ns * (val + 1); do_div(duty, period_ns); return (unsigned long)duty; @@ -231,7 +233,7 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); - u32 val, period, duty; + u32 period, duty; mutex_lock(&fpc->lock); @@ -256,11 +258,9 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -EINVAL; } - val = readl(fpc->base + FTM_SC); - val &= ~FTM_SC_PS_MASK; - val |= fpc->clk_ps; - writel(val, fpc->base + FTM_SC); - writel(period - 1, fpc->base + FTM_MOD); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_PS_MASK, + fpc->clk_ps); + regmap_write(fpc->regmap, FTM_MOD, period - 1); fpc->period_ns = period_ns; } @@ -269,8 +269,9 @@ static int fsl_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, duty = fsl_pwm_calculate_duty(fpc, period_ns, duty_ns); - writel(FTM_CSC_MSB | FTM_CSC_ELSB, fpc->base + FTM_CSC(pwm->hwpwm)); - writel(duty, fpc->base + FTM_CV(pwm->hwpwm)); + regmap_write(fpc->regmap, FTM_CSC(pwm->hwpwm), + FTM_CSC_MSB | FTM_CSC_ELSB); + regmap_write(fpc->regmap, FTM_CV(pwm->hwpwm), duty); return 0; } @@ -282,31 +283,28 @@ static int fsl_pwm_set_polarity(struct pwm_chip *chip, struct fsl_pwm_chip *fpc = to_fsl_chip(chip); u32 val; - val = readl(fpc->base + FTM_POL); + regmap_read(fpc->regmap, FTM_POL, &val); if (polarity == PWM_POLARITY_INVERSED) val |= BIT(pwm->hwpwm); else val &= ~BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_POL); + regmap_write(fpc->regmap, FTM_POL, val); return 0; } static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) { - u32 val; int ret; if (fpc->use_count != 0) return 0; /* select counter clock source */ - val = readl(fpc->base + FTM_SC); - val &= ~FTM_SC_CLK_MASK; - val |= FTM_SC_CLK(fpc->cnt_select); - writel(val, fpc->base + FTM_SC); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, + FTM_SC_CLK(fpc->cnt_select)); ret = clk_prepare_enable(fpc->clk[fpc->cnt_select]); if (ret) @@ -326,13 +324,10 @@ static int fsl_counter_clock_enable(struct fsl_pwm_chip *fpc) static int fsl_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct fsl_pwm_chip *fpc = to_fsl_chip(chip); - u32 val; int ret; mutex_lock(&fpc->lock); - val = readl(fpc->base + FTM_OUTMASK); - val &= ~BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_OUTMASK); + regmap_update_bits(fpc->regmap, FTM_OUTMASK, BIT(pwm->hwpwm), 0); ret = fsl_counter_clock_enable(fpc); mutex_unlock(&fpc->lock); @@ -342,8 +337,6 @@ static int fsl_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) { - u32 val; - /* * already disabled, do nothing */ @@ -355,9 +348,7 @@ static void fsl_counter_clock_disable(struct fsl_pwm_chip *fpc) return; /* no users left, disable PWM counter clock */ - val = readl(fpc->base + FTM_SC); - val &= ~FTM_SC_CLK_MASK; - writel(val, fpc->base + FTM_SC); + regmap_update_bits(fpc->regmap, FTM_SC, FTM_SC_CLK_MASK, 0); clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_CNTEN]); clk_disable_unprepare(fpc->clk[fpc->cnt_select]); @@ -369,14 +360,12 @@ static void fsl_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) u32 val; mutex_lock(&fpc->lock); - val = readl(fpc->base + FTM_OUTMASK); - val |= BIT(pwm->hwpwm); - writel(val, fpc->base + FTM_OUTMASK); + regmap_update_bits(fpc->regmap, FTM_OUTMASK, BIT(pwm->hwpwm), + BIT(pwm->hwpwm)); fsl_counter_clock_disable(fpc); - val = readl(fpc->base + FTM_OUTMASK); - + regmap_read(fpc->regmap, FTM_OUTMASK, &val); if ((val & 0xFF) == 0xFF) fpc->period_ns = 0; @@ -401,19 +390,28 @@ static int fsl_pwm_init(struct fsl_pwm_chip *fpc) if (ret) return ret; - writel(0x00, fpc->base + FTM_CNTIN); - writel(0x00, fpc->base + FTM_OUTINIT); - writel(0xFF, fpc->base + FTM_OUTMASK); + regmap_write(fpc->regmap, FTM_CNTIN, 0x00); + regmap_write(fpc->regmap, FTM_OUTINIT, 0x00); + regmap_write(fpc->regmap, FTM_OUTMASK, 0xFF); clk_disable_unprepare(fpc->clk[FSL_PWM_CLK_SYS]); return 0; } +static const struct regmap_config fsl_pwm_regmap_config = { + .reg_bits = 32, + .reg_stride = 4, + .val_bits = 32, + + .max_register = FTM_PWMLOAD, +}; + static int fsl_pwm_probe(struct platform_device *pdev) { struct fsl_pwm_chip *fpc; struct resource *res; + void __iomem *base; int ret; fpc = devm_kzalloc(&pdev->dev, sizeof(*fpc), GFP_KERNEL); @@ -425,9 +423,16 @@ static int fsl_pwm_probe(struct platform_device *pdev) fpc->chip.dev = &pdev->dev; res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - fpc->base = devm_ioremap_resource(&pdev->dev, res); - if (IS_ERR(fpc->base)) - return PTR_ERR(fpc->base); + base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(base)) + return PTR_ERR(base); + + fpc->regmap = devm_regmap_init_mmio_clk(&pdev->dev, NULL, base, + &fsl_pwm_regmap_config); + if (IS_ERR(fpc->regmap)) { + dev_err(&pdev->dev, "regmap init failed\n"); + return PTR_ERR(fpc->regmap); + } fpc->clk[FSL_PWM_CLK_SYS] = devm_clk_get(&pdev->dev, "ftm_sys"); if (IS_ERR(fpc->clk[FSL_PWM_CLK_SYS])) { -- cgit v1.2.3 From a535e2e0debc2255fcf60a11d73fbb0534454cc3 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Tue, 19 Aug 2014 12:38:03 +0800 Subject: pwm: fsl-ftm: Document 'big-endian' property The same FTM PWM device can have a different endianness on different SoCs. The device tree provides a property to describing this so that an operating system device driver can handle all variants of the device. Refer to the table below for the endianness of the FTM PWM block as integrated into the existing SoCs: SoC | FTM-PWM endianness --------+------------------- Vybrid | LE LS1 | BE LS2 | LE Signed-off-by: Xiubo Li Signed-off-by: Thierry Reding --- Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt index 0bda229a6171..3899d6a557c1 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-fsl-ftm.txt @@ -1,5 +1,20 @@ Freescale FlexTimer Module (FTM) PWM controller +The same FTM PWM device can have a different endianness on different SoCs. The +device tree provides a property to describing this so that an operating system +device driver can handle all variants of the device. Refer to the table below +for the endianness of the FTM PWM block as integrated into the existing SoCs: + + SoC | FTM-PWM endianness + --------+------------------- + Vybrid | LE + LS1 | BE + LS2 | LE + +Please see ../regmap/regmap.txt for more detail about how to specify endian +modes in device tree. + + Required properties: - compatible: Should be "fsl,vf610-ftm-pwm". - reg: Physical base address and length of the controller's registers @@ -16,7 +31,8 @@ Required properties: - pinctrl-names: Must contain a "default" entry. - pinctrl-NNN: One property must exist for each entry in pinctrl-names. See pinctrl/pinctrl-bindings.txt for details of the property values. - +- big-endian: Boolean property, required if the FTM PWM registers use a big- + endian rather than little-endian layout. Example: @@ -32,4 +48,5 @@ pwm0: pwm@40038000 { <&clks VF610_CLK_FTM0_EXT_FIX_EN>; pinctrl-names = "default"; pinctrl-0 = <&pinctrl_pwm0_1>; + big-endian; }; -- cgit v1.2.3 From 00018a8ae5c552a2464e0df15437511ba4f56495 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Thu, 21 Aug 2014 20:50:25 -0300 Subject: pwm: fsl-ftm: Select REGMAP_MMIO Commit 42fa98a9c360 ("pwm: fsl-ftm: Convert to direct regmap API usage") introduced the following error when REGMAP_MMIO=n: drivers/built-in.o: In function `fsl_pwm_probe': >> pwm-fsl-ftm.c:(.text+0xd7d7): undefined reference to `devm_regmap_init_mmio_clk' Select select REGMAP_MMIO in order to fix this error. Reported-by: kbuild test robot Signed-off-by: Fabio Estevam Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index b800783800a3..e56e91e4fde6 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -101,6 +101,7 @@ config PWM_EP93XX config PWM_FSL_FTM tristate "Freescale FlexTimer Module (FTM) PWM support" depends on OF + select REGMAP_MMIO help Generic FTM PWM framework driver for Freescale VF610 and Layerscape LS-1 SoCs. -- cgit v1.2.3 From 373c57829a3f9da1405b1fbd3d17e50f8e1f476e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Tue, 19 Aug 2014 17:18:29 +0300 Subject: pwm: lpss: Add ACPI and PCI IDs for Intel Braswell This is pretty much the same as Baytrail PWM. Only difference is that the input clock runs on different frequency. Signed-off-by: Alan Cox Signed-off-by: Mika Westerberg Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpss.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 4df994f72d96..d04eee7aa967 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -48,6 +48,11 @@ static const struct pwm_lpss_boardinfo byt_info = { 25000000 }; +/* Braswell */ +static const struct pwm_lpss_boardinfo bsw_info = { + 19200000 +}; + static inline struct pwm_lpss_chip *to_lpwm(struct pwm_chip *chip) { return container_of(chip, struct pwm_lpss_chip, chip); @@ -189,6 +194,8 @@ static void pwm_lpss_remove_pci(struct pci_dev *pdev) static struct pci_device_id pwm_lpss_pci_ids[] = { { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&byt_info}, { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&byt_info}, + { PCI_VDEVICE(INTEL, 0x2288), (unsigned long)&bsw_info}, + { PCI_VDEVICE(INTEL, 0x2289), (unsigned long)&bsw_info}, { }, }; MODULE_DEVICE_TABLE(pci, pwm_lpss_pci_ids); @@ -231,6 +238,7 @@ static int pwm_lpss_remove_platform(struct platform_device *pdev) static const struct acpi_device_id pwm_lpss_acpi_match[] = { { "80860F09", (unsigned long)&byt_info }, + { "80862288", (unsigned long)&bsw_info }, { }, }; MODULE_DEVICE_TABLE(acpi, pwm_lpss_acpi_match); -- cgit v1.2.3 From c558e39e14c2372394f49e07fbe94e9708b615cb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Aug 2014 19:17:35 +0300 Subject: pwm: lpss: Properly split driver to parts The driver consists of core, PCI, and platform parts. It would be better to split them into separate files. The platform driver is now called pwm-lpss-platform. Thus, previously set CONFIG_PWM_LPSS=m is not enough to build it. But we are on the safe side since it seems no one from outside Intel is using it for now. While here, move to use macros module_pci_driver() and module_platform_driver(). Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Acked-by: Alan Cox [thierry.reding: change select to depends on PWM_LPSS, cleanup] Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 19 +++++- drivers/pwm/Makefile | 2 + drivers/pwm/pwm-lpss-pci.c | 65 +++++++++++++++++++ drivers/pwm/pwm-lpss-platform.c | 68 ++++++++++++++++++++ drivers/pwm/pwm-lpss.c | 136 +++------------------------------------- drivers/pwm/pwm-lpss.h | 32 ++++++++++ 6 files changed, 195 insertions(+), 127 deletions(-) create mode 100644 drivers/pwm/pwm-lpss-pci.c create mode 100644 drivers/pwm/pwm-lpss-platform.c create mode 100644 drivers/pwm/pwm-lpss.h diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index e56e91e4fde6..090741635f71 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -150,7 +150,6 @@ config PWM_LPC32XX config PWM_LPSS tristate "Intel LPSS PWM support" - depends on ACPI help Generic PWM framework driver for Intel Low Power Subsystem PWM controller. @@ -158,6 +157,24 @@ config PWM_LPSS To compile this driver as a module, choose M here: the module will be called pwm-lpss. +config PWM_LPSS_PCI + tristate "Intel LPSS PWM PCI driver" + depends on PWM_LPSS && PCI + help + The PCI driver for Intel Low Power Subsystem PWM controller. + + To compile this driver as a module, choose M here: the module + will be called pwm-lpss-pci. + +config PWM_LPSS_PLATFORM + tristate "Intel LPSS PWM platform driver" + depends on PWM_LPSS && ACPI + help + The platform driver for Intel Low Power Subsystem PWM controller. + + To compile this driver as a module, choose M here: the module + will be called pwm-lpss-platform. + config PWM_MXS tristate "Freescale MXS PWM support" depends on ARCH_MXS && OF diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile index f8c577d41091..c458606c3755 100644 --- a/drivers/pwm/Makefile +++ b/drivers/pwm/Makefile @@ -13,6 +13,8 @@ obj-$(CONFIG_PWM_JZ4740) += pwm-jz4740.o obj-$(CONFIG_PWM_LP3943) += pwm-lp3943.o obj-$(CONFIG_PWM_LPC32XX) += pwm-lpc32xx.o obj-$(CONFIG_PWM_LPSS) += pwm-lpss.o +obj-$(CONFIG_PWM_LPSS_PCI) += pwm-lpss-pci.o +obj-$(CONFIG_PWM_LPSS_PLATFORM) += pwm-lpss-platform.o obj-$(CONFIG_PWM_MXS) += pwm-mxs.o obj-$(CONFIG_PWM_PCA9685) += pwm-pca9685.o obj-$(CONFIG_PWM_PUV3) += pwm-puv3.o diff --git a/drivers/pwm/pwm-lpss-pci.c b/drivers/pwm/pwm-lpss-pci.c new file mode 100644 index 000000000000..1bfdd89c329c --- /dev/null +++ b/drivers/pwm/pwm-lpss-pci.c @@ -0,0 +1,65 @@ +/* + * Intel Low Power Subsystem PWM controller PCI driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include "pwm-lpss.h" + +static int pwm_lpss_probe_pci(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + const struct pwm_lpss_boardinfo *info; + struct pwm_lpss_chip *lpwm; + int err; + + err = pci_enable_device(pdev); + if (err < 0) + return err; + + info = (struct pwm_lpss_boardinfo *)id->driver_data; + lpwm = pwm_lpss_probe(&pdev->dev, &pdev->resource[0], info); + if (IS_ERR(lpwm)) + return PTR_ERR(lpwm); + + pci_set_drvdata(pdev, lpwm); + return 0; +} + +static void pwm_lpss_remove_pci(struct pci_dev *pdev) +{ + struct pwm_lpss_chip *lpwm = pci_get_drvdata(pdev); + + pwm_lpss_remove(lpwm); + pci_disable_device(pdev); +} + +static const struct pci_device_id pwm_lpss_pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&pwm_lpss_byt_info}, + { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&pwm_lpss_byt_info}, + { PCI_VDEVICE(INTEL, 0x2288), (unsigned long)&pwm_lpss_bsw_info}, + { PCI_VDEVICE(INTEL, 0x2289), (unsigned long)&pwm_lpss_bsw_info}, + { }, +}; +MODULE_DEVICE_TABLE(pci, pwm_lpss_pci_ids); + +static struct pci_driver pwm_lpss_driver_pci = { + .name = "pwm-lpss", + .id_table = pwm_lpss_pci_ids, + .probe = pwm_lpss_probe_pci, + .remove = pwm_lpss_remove_pci, +}; +module_pci_driver(pwm_lpss_driver_pci); + +MODULE_DESCRIPTION("PWM PCI driver for Intel LPSS"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pwm/pwm-lpss-platform.c b/drivers/pwm/pwm-lpss-platform.c new file mode 100644 index 000000000000..18a9c880a76d --- /dev/null +++ b/drivers/pwm/pwm-lpss-platform.c @@ -0,0 +1,68 @@ +/* + * Intel Low Power Subsystem PWM controller driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include "pwm-lpss.h" + +static int pwm_lpss_probe_platform(struct platform_device *pdev) +{ + const struct pwm_lpss_boardinfo *info; + const struct acpi_device_id *id; + struct pwm_lpss_chip *lpwm; + struct resource *r; + + id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); + if (!id) + return -ENODEV; + + info = (const struct pwm_lpss_boardinfo *)id->driver_data; + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + + lpwm = pwm_lpss_probe(&pdev->dev, r, info); + if (IS_ERR(lpwm)) + return PTR_ERR(lpwm); + + platform_set_drvdata(pdev, lpwm); + return 0; +} + +static int pwm_lpss_remove_platform(struct platform_device *pdev) +{ + struct pwm_lpss_chip *lpwm = platform_get_drvdata(pdev); + + return pwm_lpss_remove(lpwm); +} + +static const struct acpi_device_id pwm_lpss_acpi_match[] = { + { "80860F09", (unsigned long)&pwm_lpss_byt_info }, + { "80862288", (unsigned long)&pwm_lpss_bsw_info }, + { }, +}; +MODULE_DEVICE_TABLE(acpi, pwm_lpss_acpi_match); + +static struct platform_driver pwm_lpss_driver_platform = { + .driver = { + .name = "pwm-lpss", + .acpi_match_table = pwm_lpss_acpi_match, + }, + .probe = pwm_lpss_probe_platform, + .remove = pwm_lpss_remove_platform, +}; +module_platform_driver(pwm_lpss_driver_platform); + +MODULE_DESCRIPTION("PWM platform driver for Intel LPSS"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:pwm-lpss"); diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index d04eee7aa967..ce9bf147811f 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -13,15 +13,10 @@ * published by the Free Software Foundation. */ -#include -#include #include #include -#include -#include -#include -static int pci_drv, plat_drv; /* So we know which drivers registered */ +#include "pwm-lpss.h" #define PWM 0x00000000 #define PWM_ENABLE BIT(31) @@ -39,19 +34,17 @@ struct pwm_lpss_chip { unsigned long clk_rate; }; -struct pwm_lpss_boardinfo { - unsigned long clk_rate; -}; - /* BayTrail */ -static const struct pwm_lpss_boardinfo byt_info = { +const struct pwm_lpss_boardinfo pwm_lpss_byt_info = { 25000000 }; +EXPORT_SYMBOL_GPL(pwm_lpss_byt_info); /* Braswell */ -static const struct pwm_lpss_boardinfo bsw_info = { +const struct pwm_lpss_boardinfo pwm_lpss_bsw_info = { 19200000 }; +EXPORT_SYMBOL_GPL(pwm_lpss_bsw_info); static inline struct pwm_lpss_chip *to_lpwm(struct pwm_chip *chip) { @@ -123,9 +116,8 @@ static const struct pwm_ops pwm_lpss_ops = { .owner = THIS_MODULE, }; -static struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, - struct resource *r, - const struct pwm_lpss_boardinfo *info) +struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, + const struct pwm_lpss_boardinfo *info) { struct pwm_lpss_chip *lpwm; int ret; @@ -152,8 +144,9 @@ static struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, return lpwm; } +EXPORT_SYMBOL_GPL(pwm_lpss_probe); -static int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) +int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) { u32 ctrl; @@ -162,117 +155,8 @@ static int pwm_lpss_remove(struct pwm_lpss_chip *lpwm) return pwmchip_remove(&lpwm->chip); } - -static int pwm_lpss_probe_pci(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - const struct pwm_lpss_boardinfo *info; - struct pwm_lpss_chip *lpwm; - int err; - - err = pci_enable_device(pdev); - if (err < 0) - return err; - - info = (struct pwm_lpss_boardinfo *)id->driver_data; - lpwm = pwm_lpss_probe(&pdev->dev, &pdev->resource[0], info); - if (IS_ERR(lpwm)) - return PTR_ERR(lpwm); - - pci_set_drvdata(pdev, lpwm); - return 0; -} - -static void pwm_lpss_remove_pci(struct pci_dev *pdev) -{ - struct pwm_lpss_chip *lpwm = pci_get_drvdata(pdev); - - pwm_lpss_remove(lpwm); - pci_disable_device(pdev); -} - -static struct pci_device_id pwm_lpss_pci_ids[] = { - { PCI_VDEVICE(INTEL, 0x0f08), (unsigned long)&byt_info}, - { PCI_VDEVICE(INTEL, 0x0f09), (unsigned long)&byt_info}, - { PCI_VDEVICE(INTEL, 0x2288), (unsigned long)&bsw_info}, - { PCI_VDEVICE(INTEL, 0x2289), (unsigned long)&bsw_info}, - { }, -}; -MODULE_DEVICE_TABLE(pci, pwm_lpss_pci_ids); - -static struct pci_driver pwm_lpss_driver_pci = { - .name = "pwm-lpss", - .id_table = pwm_lpss_pci_ids, - .probe = pwm_lpss_probe_pci, - .remove = pwm_lpss_remove_pci, -}; - -static int pwm_lpss_probe_platform(struct platform_device *pdev) -{ - const struct pwm_lpss_boardinfo *info; - const struct acpi_device_id *id; - struct pwm_lpss_chip *lpwm; - struct resource *r; - - id = acpi_match_device(pdev->dev.driver->acpi_match_table, &pdev->dev); - if (!id) - return -ENODEV; - - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - - info = (struct pwm_lpss_boardinfo *)id->driver_data; - lpwm = pwm_lpss_probe(&pdev->dev, r, info); - if (IS_ERR(lpwm)) - return PTR_ERR(lpwm); - - platform_set_drvdata(pdev, lpwm); - return 0; -} - -static int pwm_lpss_remove_platform(struct platform_device *pdev) -{ - struct pwm_lpss_chip *lpwm = platform_get_drvdata(pdev); - - return pwm_lpss_remove(lpwm); -} - -static const struct acpi_device_id pwm_lpss_acpi_match[] = { - { "80860F09", (unsigned long)&byt_info }, - { "80862288", (unsigned long)&bsw_info }, - { }, -}; -MODULE_DEVICE_TABLE(acpi, pwm_lpss_acpi_match); - -static struct platform_driver pwm_lpss_driver_platform = { - .driver = { - .name = "pwm-lpss", - .acpi_match_table = pwm_lpss_acpi_match, - }, - .probe = pwm_lpss_probe_platform, - .remove = pwm_lpss_remove_platform, -}; - -static int __init pwm_init(void) -{ - pci_drv = pci_register_driver(&pwm_lpss_driver_pci); - plat_drv = platform_driver_register(&pwm_lpss_driver_platform); - if (pci_drv && plat_drv) - return pci_drv; - - return 0; -} -module_init(pwm_init); - -static void __exit pwm_exit(void) -{ - if (!pci_drv) - pci_unregister_driver(&pwm_lpss_driver_pci); - if (!plat_drv) - platform_driver_unregister(&pwm_lpss_driver_platform); -} -module_exit(pwm_exit); +EXPORT_SYMBOL_GPL(pwm_lpss_remove); MODULE_DESCRIPTION("PWM driver for Intel LPSS"); MODULE_AUTHOR("Mika Westerberg "); MODULE_LICENSE("GPL v2"); -MODULE_ALIAS("platform:pwm-lpss"); diff --git a/drivers/pwm/pwm-lpss.h b/drivers/pwm/pwm-lpss.h new file mode 100644 index 000000000000..aa041bb1b67d --- /dev/null +++ b/drivers/pwm/pwm-lpss.h @@ -0,0 +1,32 @@ +/* + * Intel Low Power Subsystem PWM controller driver + * + * Copyright (C) 2014, Intel Corporation + * + * Derived from the original pwm-lpss.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __PWM_LPSS_H +#define __PWM_LPSS_H + +#include +#include + +struct pwm_lpss_chip; + +struct pwm_lpss_boardinfo { + unsigned long clk_rate; +}; + +extern const struct pwm_lpss_boardinfo pwm_lpss_byt_info; +extern const struct pwm_lpss_boardinfo pwm_lpss_bsw_info; + +struct pwm_lpss_chip *pwm_lpss_probe(struct device *dev, struct resource *r, + const struct pwm_lpss_boardinfo *info); +int pwm_lpss_remove(struct pwm_lpss_chip *lpwm); + +#endif /* __PWM_LPSS_H */ -- cgit v1.2.3 From 90927fe9a001340304e0c37dee578e4432b1744e Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 19 Aug 2014 19:17:36 +0300 Subject: pwm: lpss: pci: Move to use pcim_enable_device() Let's use managed functions for this driver. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpss-pci.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-lpss-pci.c b/drivers/pwm/pwm-lpss-pci.c index 1bfdd89c329c..cf20d2beacdd 100644 --- a/drivers/pwm/pwm-lpss-pci.c +++ b/drivers/pwm/pwm-lpss-pci.c @@ -23,7 +23,7 @@ static int pwm_lpss_probe_pci(struct pci_dev *pdev, struct pwm_lpss_chip *lpwm; int err; - err = pci_enable_device(pdev); + err = pcim_enable_device(pdev); if (err < 0) return err; @@ -41,7 +41,6 @@ static void pwm_lpss_remove_pci(struct pci_dev *pdev) struct pwm_lpss_chip *lpwm = pci_get_drvdata(pdev); pwm_lpss_remove(lpwm); - pci_disable_device(pdev); } static const struct pci_device_id pwm_lpss_pci_ids[] = { -- cgit v1.2.3 From e0c86a3b63e948e51a47d17382c7cd8711d19750 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Sat, 23 Aug 2014 00:22:45 +0200 Subject: pwm: lpss: Fix build failure on PowerPC An x86 build seems to pull in the linux/io.h include indirectly. On PowerPC that doesn't happen and the build breaks due to the readl() and writel() functions not being declared. Fix this by explicitly including linux/io.h. Reported-by: Stephen Rothwell Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpss.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index ce9bf147811f..7a7a934a7757 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -13,6 +13,7 @@ * published by the Free Software Foundation. */ +#include #include #include -- cgit v1.2.3 From b2b7adeb21745266326d453b95e5d0b1b9cb1d4e Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 23 Aug 2014 13:20:25 +0200 Subject: pwm: lpss: use c99 initializers in structures Use c99 initializers for structures. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @decl@ identifier i1,fld; type T; field list[n] fs; @@ struct i1 { fs T fld; ...}; @bad@ identifier decl.i1,i2; expression e; initializer list[decl.n] is; @@ struct i1 i2 = { is, + .fld = e - e ,...}; // Signed-off-by: Julia Lawall [thierry.reding: rebased and applied same fix for Braswell] Signed-off-by: Thierry Reding --- drivers/pwm/pwm-lpss.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-lpss.c b/drivers/pwm/pwm-lpss.c index 7a7a934a7757..e9798253a16f 100644 --- a/drivers/pwm/pwm-lpss.c +++ b/drivers/pwm/pwm-lpss.c @@ -37,13 +37,13 @@ struct pwm_lpss_chip { /* BayTrail */ const struct pwm_lpss_boardinfo pwm_lpss_byt_info = { - 25000000 + .clk_rate = 25000000 }; EXPORT_SYMBOL_GPL(pwm_lpss_byt_info); /* Braswell */ const struct pwm_lpss_boardinfo pwm_lpss_bsw_info = { - 19200000 + .clk_rate = 19200000 }; EXPORT_SYMBOL_GPL(pwm_lpss_bsw_info); -- cgit v1.2.3 From ad16202de8d884c10ef7637ea3982953519c2418 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 25 Aug 2014 12:38:31 +0300 Subject: pwm: lpss: make it buildable only on X86 There is no sign of this IP block on non-x86 architectures and rather will not be. Thus, make this explicit by applying a direct dependency to X86. Signed-off-by: Andy Shevchenko Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 090741635f71..3865dfb9ed08 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -150,6 +150,7 @@ config PWM_LPC32XX config PWM_LPSS tristate "Intel LPSS PWM support" + depends on X86 help Generic PWM framework driver for Intel Low Power Subsystem PWM controller. -- cgit v1.2.3 From 533acc0e8df7d6553f11cf91c177211cb6037968 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 18 Aug 2014 17:08:44 +0800 Subject: pwm: Fix possible ZERO_SIZE_PTR pointer dereferencing error. Since we cannot make sure the 'chip->npwm' will always be none zero here, and then if either equal to zero, the kzalloc() will return ZERO_SIZE_PTR, which equals to ((void *)16). So this patch fix this with just doing the zero check before calling kzalloc(). Signed-off-by: Xiubo Li Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index d2c35920ff08..8c748b17f416 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -236,7 +236,7 @@ int pwmchip_add(struct pwm_chip *chip) int ret; if (!chip || !chip->dev || !chip->ops || !chip->ops->config || - !chip->ops->enable || !chip->ops->disable) + !chip->ops->enable || !chip->ops->disable || !chip->npwm) return -EINVAL; mutex_lock(&pwm_lock); -- cgit v1.2.3 From bd59bdc898623e6c948a9f900250ce7343cf9012 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Wed, 28 May 2014 18:50:11 +0800 Subject: pwm: imx: Fix the macro MX3_PWMCR_PRESCALER(x) definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds missing parentheses around the argument of the macro MX3_PWMCR_PRESCALER(x) to avoid any potential macro expansion issue. Reported-by: Lothar Waßmann Cc: Thierry Reding Cc: Sascha Hauer Cc: Shawn Guo Cc: Lothar Waßmann Cc: linux-pwm@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Liu Ying Acked-by: Shawn Guo Signed-off-by: Thierry Reding --- drivers/pwm/pwm-imx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index 5449d9150d40..183225e41d60 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -32,7 +32,7 @@ #define MX3_PWMCR 0x00 /* PWM Control Register */ #define MX3_PWMSAR 0x0C /* PWM Sample Register */ #define MX3_PWMPR 0x10 /* PWM Period Register */ -#define MX3_PWMCR_PRESCALER(x) (((x - 1) & 0xFFF) << 4) +#define MX3_PWMCR_PRESCALER(x) ((((x) - 1) & 0xFFF) << 4) #define MX3_PWMCR_DOZEEN (1 << 24) #define MX3_PWMCR_WAITEN (1 << 23) #define MX3_PWMCR_DBGEN (1 << 22) -- cgit v1.2.3 From 40f260c2cebb464dda6916055112963f1421a111 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Wed, 28 May 2014 18:50:12 +0800 Subject: pwm: imx: Cleanup indentation for register definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch contains no logic change to cleanup indentation for register definitions only. Cc: Thierry Reding Cc: Sascha Hauer Cc: Shawn Guo Cc: Lothar Waßmann Cc: linux-pwm@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Liu Ying Acked-by: Shawn Guo Signed-off-by: Thierry Reding --- drivers/pwm/pwm-imx.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index 183225e41d60..fb68534b098c 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -21,24 +21,24 @@ /* i.MX1 and i.MX21 share the same PWM function block: */ -#define MX1_PWMC 0x00 /* PWM Control Register */ -#define MX1_PWMS 0x04 /* PWM Sample Register */ -#define MX1_PWMP 0x08 /* PWM Period Register */ +#define MX1_PWMC 0x00 /* PWM Control Register */ +#define MX1_PWMS 0x04 /* PWM Sample Register */ +#define MX1_PWMP 0x08 /* PWM Period Register */ -#define MX1_PWMC_EN (1 << 4) +#define MX1_PWMC_EN (1 << 4) /* i.MX27, i.MX31, i.MX35 share the same PWM function block: */ -#define MX3_PWMCR 0x00 /* PWM Control Register */ -#define MX3_PWMSAR 0x0C /* PWM Sample Register */ -#define MX3_PWMPR 0x10 /* PWM Period Register */ -#define MX3_PWMCR_PRESCALER(x) ((((x) - 1) & 0xFFF) << 4) -#define MX3_PWMCR_DOZEEN (1 << 24) -#define MX3_PWMCR_WAITEN (1 << 23) +#define MX3_PWMCR 0x00 /* PWM Control Register */ +#define MX3_PWMSAR 0x0C /* PWM Sample Register */ +#define MX3_PWMPR 0x10 /* PWM Period Register */ +#define MX3_PWMCR_PRESCALER(x) ((((x) - 1) & 0xFFF) << 4) +#define MX3_PWMCR_DOZEEN (1 << 24) +#define MX3_PWMCR_WAITEN (1 << 23) #define MX3_PWMCR_DBGEN (1 << 22) -#define MX3_PWMCR_CLKSRC_IPG_HIGH (2 << 16) -#define MX3_PWMCR_CLKSRC_IPG (1 << 16) -#define MX3_PWMCR_EN (1 << 0) +#define MX3_PWMCR_CLKSRC_IPG_HIGH (2 << 16) +#define MX3_PWMCR_CLKSRC_IPG (1 << 16) +#define MX3_PWMCR_EN (1 << 0) struct imx_chip { struct clk *clk_per; -- cgit v1.2.3 From 137fd45ffec15db14034990ceac890975cae7a32 Mon Sep 17 00:00:00 2001 From: Liu Ying Date: Wed, 28 May 2014 18:50:13 +0800 Subject: pwm: imx: Avoid sample FIFO overflow for i.MX PWM version2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The i.MX PWM version2 is embedded in several i.MX SoCs, such as i.MX27, i.MX51 and i.MX6SL. There is a 4-word (16 bit) sample FIFO in this IP. Each FIFO slot determines the duty period of a PWM waveform in one full cycle. The IP spec mentions that we should not write a fourth sample because the FIFO will become full and triggers a FIFO write error (FWE) which will prevent the PWM from starting once it is enabled. In order to avoid any sample FIFO overflow issue, this patch clears all sample FIFO by doing software reset in the configuration hook when the controller is disabled or waits for a full PWM cycle to get a relinquished FIFO slot when the controller is enabled and the FIFO is fully loaded. The FIFO overflow issue can be reproduced by the following commands on the i.MX6SL EVK platform, assuming we use PWM2 for the debug LED which is driven by the pin HSIC_STROBE and the maximal brightness is 255. echo 0 > /sys/class/leds/user/brightness echo 0 > /sys/class/leds/user/brightness echo 0 > /sys/class/leds/user/brightness echo 0 > /sys/class/leds/user/brightness echo 255 > /sys/class/leds/user/brightness Here, FWE happens (PWMSR register reads 0x58) and the LED can not be lighten. Another way to reproduce the FIFO overflow issue is to run this script: while true; do echo 255 > /sys/class/leds/user/brightness; done Cc: Thierry Reding Cc: Sascha Hauer Cc: Shawn Guo Cc: Lothar Waßmann Cc: linux-pwm@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Signed-off-by: Liu Ying Acked-by: Shawn Guo Signed-off-by: Thierry Reding --- drivers/pwm/pwm-imx.c | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/pwm/pwm-imx.c b/drivers/pwm/pwm-imx.c index fb68534b098c..f8b5f109c1ab 100644 --- a/drivers/pwm/pwm-imx.c +++ b/drivers/pwm/pwm-imx.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,7 @@ /* i.MX27, i.MX31, i.MX35 share the same PWM function block: */ #define MX3_PWMCR 0x00 /* PWM Control Register */ +#define MX3_PWMSR 0x04 /* PWM Status Register */ #define MX3_PWMSAR 0x0C /* PWM Sample Register */ #define MX3_PWMPR 0x10 /* PWM Period Register */ #define MX3_PWMCR_PRESCALER(x) ((((x) - 1) & 0xFFF) << 4) @@ -38,7 +40,12 @@ #define MX3_PWMCR_DBGEN (1 << 22) #define MX3_PWMCR_CLKSRC_IPG_HIGH (2 << 16) #define MX3_PWMCR_CLKSRC_IPG (1 << 16) +#define MX3_PWMCR_SWR (1 << 3) #define MX3_PWMCR_EN (1 << 0) +#define MX3_PWMSR_FIFOAV_4WORDS 0x4 +#define MX3_PWMSR_FIFOAV_MASK 0x7 + +#define MX3_PWM_SWR_LOOP 5 struct imx_chip { struct clk *clk_per; @@ -103,9 +110,43 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct imx_chip *imx = to_imx_chip(chip); + struct device *dev = chip->dev; unsigned long long c; unsigned long period_cycles, duty_cycles, prescale; - u32 cr; + unsigned int period_ms; + bool enable = test_bit(PWMF_ENABLED, &pwm->flags); + int wait_count = 0, fifoav; + u32 cr, sr; + + /* + * i.MX PWMv2 has a 4-word sample FIFO. + * In order to avoid FIFO overflow issue, we do software reset + * to clear all sample FIFO if the controller is disabled or + * wait for a full PWM cycle to get a relinquished FIFO slot + * when the controller is enabled and the FIFO is fully loaded. + */ + if (enable) { + sr = readl(imx->mmio_base + MX3_PWMSR); + fifoav = sr & MX3_PWMSR_FIFOAV_MASK; + if (fifoav == MX3_PWMSR_FIFOAV_4WORDS) { + period_ms = DIV_ROUND_UP(pwm->period, NSEC_PER_MSEC); + msleep(period_ms); + + sr = readl(imx->mmio_base + MX3_PWMSR); + if (fifoav == (sr & MX3_PWMSR_FIFOAV_MASK)) + dev_warn(dev, "there is no free FIFO slot\n"); + } + } else { + writel(MX3_PWMCR_SWR, imx->mmio_base + MX3_PWMCR); + do { + usleep_range(200, 1000); + cr = readl(imx->mmio_base + MX3_PWMCR); + } while ((cr & MX3_PWMCR_SWR) && + (wait_count++ < MX3_PWM_SWR_LOOP)); + + if (cr & MX3_PWMCR_SWR) + dev_warn(dev, "software reset timeout\n"); + } c = clk_get_rate(imx->clk_per); c = c * period_ns; @@ -135,7 +176,7 @@ static int imx_pwm_config_v2(struct pwm_chip *chip, MX3_PWMCR_DOZEEN | MX3_PWMCR_WAITEN | MX3_PWMCR_DBGEN | MX3_PWMCR_CLKSRC_IPG_HIGH; - if (test_bit(PWMF_ENABLED, &pwm->flags)) + if (enable) cr |= MX3_PWMCR_EN; writel(cr, imx->mmio_base + MX3_PWMCR); -- cgit v1.2.3 From 7264354c0cb8c04bd4a85d24e5d57a0e2417c2fb Mon Sep 17 00:00:00 2001 From: Doug Anderson Date: Mon, 25 Aug 2014 15:59:25 -0700 Subject: pwm: rockchip: Allow polarity invert on rk3288 The rk3288 has the ability to invert the polarity of the PWM. Let's enable that ability. Note that this increases pwm_cells to 3 for rk3288. Signed-off-by: Doug Anderson Reviewed-by: Caesar Wang Signed-off-by: Thierry Reding --- .../devicetree/bindings/pwm/pwm-rockchip.txt | 4 +- drivers/pwm/pwm-rockchip.c | 57 ++++++++++++++++++---- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt index d47d15a6a298..b8be3d09ee26 100644 --- a/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt +++ b/Documentation/devicetree/bindings/pwm/pwm-rockchip.txt @@ -7,8 +7,8 @@ Required properties: "rockchip,vop-pwm": found integrated in VOP on RK3288 SoC - reg: physical base address and length of the controller's registers - clocks: phandle and clock specifier of the PWM reference clock - - #pwm-cells: should be 2. See pwm.txt in this directory for a - description of the cell format. + - #pwm-cells: must be 2 (rk2928) or 3 (rk3288). See pwm.txt in this directory + for a description of the cell format. Example: diff --git a/drivers/pwm/pwm-rockchip.c b/drivers/pwm/pwm-rockchip.c index bdd8644c01cf..9442df244101 100644 --- a/drivers/pwm/pwm-rockchip.c +++ b/drivers/pwm/pwm-rockchip.c @@ -24,7 +24,9 @@ #define PWM_ENABLE (1 << 0) #define PWM_CONTINUOUS (1 << 1) #define PWM_DUTY_POSITIVE (1 << 3) +#define PWM_DUTY_NEGATIVE (0 << 3) #define PWM_INACTIVE_NEGATIVE (0 << 4) +#define PWM_INACTIVE_POSITIVE (1 << 4) #define PWM_OUTPUT_LEFT (0 << 5) #define PWM_LP_DISABLE (0 << 8) @@ -45,8 +47,10 @@ struct rockchip_pwm_regs { struct rockchip_pwm_data { struct rockchip_pwm_regs regs; unsigned int prescaler; + const struct pwm_ops *ops; - void (*set_enable)(struct pwm_chip *chip, bool enable); + void (*set_enable)(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable); }; static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) @@ -54,7 +58,8 @@ static inline struct rockchip_pwm_chip *to_rockchip_pwm_chip(struct pwm_chip *c) return container_of(c, struct rockchip_pwm_chip, chip); } -static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, bool enable) +static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); u32 enable_conf = PWM_CTRL_OUTPUT_EN | PWM_CTRL_TIMER_EN; @@ -70,14 +75,19 @@ static void rockchip_pwm_set_enable_v1(struct pwm_chip *chip, bool enable) writel_relaxed(val, pc->base + pc->data->regs.ctrl); } -static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip, bool enable) +static void rockchip_pwm_set_enable_v2(struct pwm_chip *chip, + struct pwm_device *pwm, bool enable) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); u32 enable_conf = PWM_OUTPUT_LEFT | PWM_LP_DISABLE | PWM_ENABLE | - PWM_CONTINUOUS | PWM_DUTY_POSITIVE | - PWM_INACTIVE_NEGATIVE; + PWM_CONTINUOUS; u32 val; + if (pwm->polarity == PWM_POLARITY_INVERSED) + enable_conf |= PWM_DUTY_NEGATIVE | PWM_INACTIVE_POSITIVE; + else + enable_conf |= PWM_DUTY_POSITIVE | PWM_INACTIVE_NEGATIVE; + val = readl_relaxed(pc->base + pc->data->regs.ctrl); if (enable) @@ -124,6 +134,19 @@ static int rockchip_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return 0; } +static int rockchip_pwm_set_polarity(struct pwm_chip *chip, + struct pwm_device *pwm, + enum pwm_polarity polarity) +{ + /* + * No action needed here because pwm->polarity will be set by the core + * and the core will only change polarity when the PWM is not enabled. + * We'll handle things in set_enable(). + */ + + return 0; +} + static int rockchip_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); @@ -133,7 +156,7 @@ static int rockchip_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm) if (ret) return ret; - pc->data->set_enable(chip, true); + pc->data->set_enable(chip, pwm, true); return 0; } @@ -142,18 +165,26 @@ static void rockchip_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm) { struct rockchip_pwm_chip *pc = to_rockchip_pwm_chip(chip); - pc->data->set_enable(chip, false); + pc->data->set_enable(chip, pwm, false); clk_disable(pc->clk); } -static const struct pwm_ops rockchip_pwm_ops = { +static const struct pwm_ops rockchip_pwm_ops_v1 = { .config = rockchip_pwm_config, .enable = rockchip_pwm_enable, .disable = rockchip_pwm_disable, .owner = THIS_MODULE, }; +static const struct pwm_ops rockchip_pwm_ops_v2 = { + .config = rockchip_pwm_config, + .set_polarity = rockchip_pwm_set_polarity, + .enable = rockchip_pwm_enable, + .disable = rockchip_pwm_disable, + .owner = THIS_MODULE, +}; + static const struct rockchip_pwm_data pwm_data_v1 = { .regs = { .duty = 0x04, @@ -162,6 +193,7 @@ static const struct rockchip_pwm_data pwm_data_v1 = { .ctrl = 0x0c, }, .prescaler = 2, + .ops = &rockchip_pwm_ops_v1, .set_enable = rockchip_pwm_set_enable_v1, }; @@ -173,6 +205,7 @@ static const struct rockchip_pwm_data pwm_data_v2 = { .ctrl = 0x0c, }, .prescaler = 1, + .ops = &rockchip_pwm_ops_v2, .set_enable = rockchip_pwm_set_enable_v2, }; @@ -184,6 +217,7 @@ static const struct rockchip_pwm_data pwm_data_vop = { .ctrl = 0x00, }, .prescaler = 1, + .ops = &rockchip_pwm_ops_v2, .set_enable = rockchip_pwm_set_enable_v2, }; @@ -227,10 +261,15 @@ static int rockchip_pwm_probe(struct platform_device *pdev) pc->data = id->data; pc->chip.dev = &pdev->dev; - pc->chip.ops = &rockchip_pwm_ops; + pc->chip.ops = pc->data->ops; pc->chip.base = -1; pc->chip.npwm = 1; + if (pc->data->ops->set_polarity) { + pc->chip.of_xlate = of_pwm_xlate_with_flags; + pc->chip.of_pwm_n_cells = 3; + } + ret = pwmchip_add(&pc->chip); if (ret < 0) { clk_unprepare(pc->clk); -- cgit v1.2.3 From e4dbf98f7f169346f57296e173e883b7330076ab Mon Sep 17 00:00:00 2001 From: Peter Feuerer Date: Tue, 22 Jul 2014 17:37:13 +0200 Subject: thermal: Added Bang-bang thermal governor The bang-bang thermal governor uses a hysteresis to switch abruptly on or off a cooling device. It is intended to control fans, which can not be throttled but just switched on or off. Bang-bang cannot be set as default governor as it is intended for special devices only. For those special devices the driver needs to explicitely request it. Cc: Andrew Morton Cc: Zhang Rui Cc: Andreas Mohr Cc: Borislav Petkov Cc: Javi Merino Cc: linux-pm@vger.kernel.org Signed-off-by: Peter Feuerer Signed-off-by: Zhang Rui --- drivers/thermal/Kconfig | 10 +++ drivers/thermal/Makefile | 1 + drivers/thermal/gov_bang_bang.c | 131 ++++++++++++++++++++++++++++++++++++++++ drivers/thermal/thermal_core.c | 5 ++ drivers/thermal/thermal_core.h | 8 +++ 5 files changed, 155 insertions(+) create mode 100644 drivers/thermal/gov_bang_bang.c diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 693208eb9047..2500ecc48260 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -84,6 +84,16 @@ config THERMAL_GOV_STEP_WISE Enable this to manage platform thermals using a simple linear governor. +config THERMAL_GOV_BANG_BANG + bool "Bang Bang thermal governor" + default n + help + Enable this to manage platform thermals using bang bang governor. + + Say 'Y' here if you want to use two point temperature regulation + used for fans without throttling. Some fan drivers depend on this + governor to be enabled (e.g. acerhdf). + config THERMAL_GOV_USER_SPACE bool "User_space thermal governor" help diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 31e232f84b6b..b7e65423d7b6 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -11,6 +11,7 @@ thermal_sys-$(CONFIG_THERMAL_OF) += of-thermal.o # governors thermal_sys-$(CONFIG_THERMAL_GOV_FAIR_SHARE) += fair_share.o +thermal_sys-$(CONFIG_THERMAL_GOV_BANG_BANG) += gov_bang_bang.o thermal_sys-$(CONFIG_THERMAL_GOV_STEP_WISE) += step_wise.o thermal_sys-$(CONFIG_THERMAL_GOV_USER_SPACE) += user_space.o diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c new file mode 100644 index 000000000000..c5dd76b2ee74 --- /dev/null +++ b/drivers/thermal/gov_bang_bang.c @@ -0,0 +1,131 @@ +/* + * gov_bang_bang.c - A simple thermal throttling governor using hysteresis + * + * Copyright (C) 2014 Peter Feuerer + * + * Based on step_wise.c with following Copyrights: + * Copyright (C) 2012 Intel Corp + * Copyright (C) 2012 Durgadoss R + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + */ + +#include + +#include "thermal_core.h" + +static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) +{ + long trip_temp; + unsigned long trip_hyst; + struct thermal_instance *instance; + + tz->ops->get_trip_temp(tz, trip, &trip_temp); + tz->ops->get_trip_hyst(tz, trip, &trip_hyst); + + dev_dbg(&tz->device, "Trip%d[temp=%ld]:temp=%d:hyst=%ld\n", + trip, trip_temp, tz->temperature, + trip_hyst); + + mutex_lock(&tz->lock); + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) { + if (instance->trip != trip) + continue; + + /* in case fan is in initial state, switch the fan off */ + if (instance->target == THERMAL_NO_TARGET) + instance->target = 0; + + /* in case fan is neither on nor off set the fan to active */ + if (instance->target != 0 && instance->target != 1) { + pr_warn("Thermal instance %s controlled by bang-bang has unexpected state: %ld\n", + instance->name, instance->target); + instance->target = 1; + } + + /* + * enable fan when temperature exceeds trip_temp and disable + * the fan in case it falls below trip_temp minus hysteresis + */ + if (instance->target == 0 && tz->temperature >= trip_temp) + instance->target = 1; + else if (instance->target == 1 && + tz->temperature < trip_temp - trip_hyst) + instance->target = 0; + + dev_dbg(&instance->cdev->device, "target=%d\n", + (int)instance->target); + + instance->cdev->updated = false; /* cdev needs update */ + } + + mutex_unlock(&tz->lock); +} + +/** + * bang_bang_control - controls devices associated with the given zone + * @tz - thermal_zone_device + * @trip - the trip point + * + * Regulation Logic: a two point regulation, deliver cooling state depending + * on the previous state shown in this diagram: + * + * Fan: OFF ON + * + * | + * | + * trip_temp: +---->+ + * | | ^ + * | | | + * | | Temperature + * (trip_temp - hyst): +<----+ + * | + * | + * | + * + * * If the fan is not running and temperature exceeds trip_temp, the fan + * gets turned on. + * * In case the fan is running, temperature must fall below + * (trip_temp - hyst) so that the fan gets turned off again. + * + */ +static int bang_bang_control(struct thermal_zone_device *tz, int trip) +{ + struct thermal_instance *instance; + + thermal_zone_trip_update(tz, trip); + + mutex_lock(&tz->lock); + + list_for_each_entry(instance, &tz->thermal_instances, tz_node) + thermal_cdev_update(instance->cdev); + + mutex_unlock(&tz->lock); + + return 0; +} + +static struct thermal_governor thermal_gov_bang_bang = { + .name = "bang_bang", + .throttle = bang_bang_control, +}; + +int thermal_gov_bang_bang_register(void) +{ + return thermal_register_governor(&thermal_gov_bang_bang); +} + +void thermal_gov_bang_bang_unregister(void) +{ + thermal_unregister_governor(&thermal_gov_bang_bang); +} diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index 71b0ec0c370d..4c2726b55a2b 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -1790,6 +1790,10 @@ static int __init thermal_register_governors(void) if (result) return result; + result = thermal_gov_bang_bang_register(); + if (result) + return result; + return thermal_gov_user_space_register(); } @@ -1797,6 +1801,7 @@ static void thermal_unregister_governors(void) { thermal_gov_step_wise_unregister(); thermal_gov_fair_share_unregister(); + thermal_gov_bang_bang_unregister(); thermal_gov_user_space_unregister(); } diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index 3db339fb636f..d15d243de27a 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -69,6 +69,14 @@ static inline int thermal_gov_fair_share_register(void) { return 0; } static inline void thermal_gov_fair_share_unregister(void) {} #endif /* CONFIG_THERMAL_GOV_FAIR_SHARE */ +#ifdef CONFIG_THERMAL_GOV_BANG_BANG +int thermal_gov_bang_bang_register(void); +void thermal_gov_bang_bang_unregister(void); +#else +static inline int thermal_gov_bang_bang_register(void) { return 0; } +static inline void thermal_gov_bang_bang_unregister(void) {} +#endif /* CONFIG_THERMAL_GOV_BANG_BANG */ + #ifdef CONFIG_THERMAL_GOV_USER_SPACE int thermal_gov_user_space_register(void); void thermal_gov_user_space_unregister(void); -- cgit v1.2.3 From 70145f87139fbc43b726f873813cd91dce371899 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 28 Aug 2014 11:03:14 +0200 Subject: pwm: Fix uninitialized warnings in pwm_get() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With some versions of gcc (e.g. 4.1.2): drivers/pwm/core.c: In function ‘pwm_get’: drivers/pwm/core.c:610: warning: ‘polarity’ may be used uninitialized in this function drivers/pwm/core.c:609: warning: ‘period’ may be used uninitialized in this function While these are false positives, we can get rid of them by refactoring the code to store a pointer to the best match, as suggested before by Thierry Reding. This does require moving the mutex_unlock() down. Fixes: d717ea73e36dd565 ("pwm: Fix period and polarity in pwm_get() for non-perfect matches") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thierry Reding --- drivers/pwm/core.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c index 8c748b17f416..966497d10c6e 100644 --- a/drivers/pwm/core.c +++ b/drivers/pwm/core.c @@ -602,12 +602,9 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) struct pwm_device *pwm = ERR_PTR(-EPROBE_DEFER); const char *dev_id = dev ? dev_name(dev) : NULL; struct pwm_chip *chip = NULL; - unsigned int index = 0; unsigned int best = 0; - struct pwm_lookup *p; + struct pwm_lookup *p, *chosen = NULL; unsigned int match; - unsigned int period; - enum pwm_polarity polarity; /* look up via DT first */ if (IS_ENABLED(CONFIG_OF) && dev && dev->of_node) @@ -653,10 +650,7 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } if (match > best) { - chip = pwmchip_find_by_name(p->provider); - index = p->index; - period = p->period; - polarity = p->polarity; + chosen = p; if (match != 3) best = match; @@ -665,17 +659,22 @@ struct pwm_device *pwm_get(struct device *dev, const char *con_id) } } - mutex_unlock(&pwm_lookup_lock); + if (!chosen) + goto out; - if (chip) - pwm = pwm_request_from_chip(chip, index, con_id ?: dev_id); - if (IS_ERR(pwm)) - return pwm; + chip = pwmchip_find_by_name(chosen->provider); + if (!chip) + goto out; - pwm_set_period(pwm, period); - pwm_set_polarity(pwm, polarity); + pwm = pwm_request_from_chip(chip, chosen->index, con_id ?: dev_id); + if (IS_ERR(pwm)) + goto out; + pwm_set_period(pwm, chosen->period); + pwm_set_polarity(pwm, chosen->polarity); +out: + mutex_unlock(&pwm_lookup_lock); return pwm; } EXPORT_SYMBOL_GPL(pwm_get); -- cgit v1.2.3 From 537d8f93805ace30ce097736d3aac041931274b1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 29 Aug 2014 20:49:51 -0400 Subject: ext4: convert ext4_dx_find_entry() to use the ERR_PTR convention Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 46 ++++++++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 90a3cdca3f88..1421ec1cd7e4 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -270,8 +270,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *err); + struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -1258,17 +1257,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (err == -ENOENT) - return NULL; - if (err && err != ERR_BAD_DX_DIR) - return ERR_PTR(err); - if (bh) + if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1366,34 +1361,32 @@ cleanup_and_exit: } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; struct buffer_head *bh; ext4_lblk_t block; - int retval; + int err = 0, retval; - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; + frame = dx_probe(d_name, dir, &hinfo, frames, &err); + if (err) + return ERR_PTR(err); do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); + if (IS_ERR(bh)) goto errout; - } + retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); - if (retval == 1) { /* Success! */ - dx_release(frames); - return bh; - } + if (retval == 1) + goto success; brelse(bh); if (retval == -1) { - *err = ERR_BAD_DX_DIR; + bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } @@ -1402,18 +1395,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q frames, NULL); if (retval < 0) { ext4_warning(sb, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; + "error %d reading index page in directory #%lu", + retval, dir->i_ino); + bh = ERR_PTR(retval); goto errout; } } while (retval == 1); - *err = -ENOENT; + bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); - dx_release (frames); - return NULL; +success: + dx_release(frames); + return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) -- cgit v1.2.3 From 1056008226769fe982236c26038a095aeb47714b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 29 Aug 2014 20:51:32 -0400 Subject: ext4: convert ext4_getblk() to use the ERR_PTR convention Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 51 +++++++++++++++++++++++++-------------------------- fs/ext4/namei.c | 9 ++++----- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0c225cdb52c..8009077079e4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2086,8 +2086,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ -struct buffer_head *ext4_getblk(handle_t *, struct inode *, - ext4_lblk_t, int, int *); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); int ext4_get_block_write(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa26e9117c4..0dfc1cd1eb52 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -734,11 +734,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *errp) + ext4_lblk_t block, int create) { struct ext4_map_blocks map; struct buffer_head *bh; - int fatal = 0, err; + int err; J_ASSERT(handle != NULL || create == 0); @@ -747,21 +747,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); - /* ensure we send some value back into *errp */ - *errp = 0; - - if (create && err == 0) - err = -ENOSPC; /* should never happen */ + if (err == 0) + return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) - *errp = err; - if (err <= 0) - return NULL; + return ERR_PTR(err); bh = sb_getblk(inode->i_sb, map.m_pblk); - if (unlikely(!bh)) { - *errp = -ENOMEM; - return NULL; - } + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { J_ASSERT(create != 0); J_ASSERT(handle != NULL); @@ -775,25 +768,26 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { + err = ext4_journal_get_create_access(handle, bh); + if (unlikely(err)) { + unlock_buffer(bh); + goto errout; + } + if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { + if (unlikely(err)) + goto errout; + } else BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } return bh; +errout: + brelse(bh); + return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, @@ -801,7 +795,12 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create, err); + *err = 0; + bh = ext4_getblk(handle, inode, block, create); + if (IS_ERR(bh)) { + *err = PTR_ERR(bh); + return NULL; + } if (!bh) return bh; if (buffer_uptodate(bh)) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1421ec1cd7e4..26f114b1e4d6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1226,8 +1226,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err = 0; - int namelen; + int i, namelen; *res_dir = NULL; sb = dir->i_sb; @@ -1293,10 +1292,10 @@ restart: break; } num++; - bh = ext4_getblk(NULL, dir, b++, 0, &err); - if (unlikely(err)) { + bh = ext4_getblk(NULL, dir, b++, 0); + if (unlikely(IS_ERR(bh))) { if (ra_max == 0) - return ERR_PTR(err); + return bh; break; } bh_use[ra_max] = bh; -- cgit v1.2.3 From 1c2150283cae895526d0db3953d13d139f4e7a03 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 29 Aug 2014 20:52:15 -0400 Subject: ext4: convert ext4_bread() to use the ERR_PTR convention Signed-off-by: Theodore Ts'o --- fs/ext4/dir.c | 8 +++----- fs/ext4/ext4.h | 3 +-- fs/ext4/inode.c | 14 ++++---------- fs/ext4/namei.c | 34 ++++++++++++++++++---------------- fs/ext4/super.c | 18 ++++++++---------- 5 files changed, 34 insertions(+), 43 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0bb3f9ea0832..c24143ea9c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); } - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ if (!bh) { if (!dir_has_error) { EXT4_ERROR_FILE(file, 0, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8009077079e4..ca53bcece838 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2087,8 +2087,7 @@ extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); -struct buffer_head *ext4_bread(handle_t *, struct inode *, - ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0dfc1cd1eb52..8aa241a000c5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -791,27 +791,21 @@ errout: } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *err) + ext4_lblk_t block, int create) { struct buffer_head *bh; - *err = 0; bh = ext4_getblk(handle, inode, block, create); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - return NULL; - } - if (!bh) + if (IS_ERR(bh)) return bh; - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return bh; ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; put_bh(bh); - *err = -EIO; - return NULL; + return ERR_PTR(-EIO); } int ext4_walk_page_buffers(handle_t *handle, diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 26f114b1e4d6..af13c908f617 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle, ext4_lblk_t *block) { struct buffer_head *bh; - int err = 0; + int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= @@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, &err); - if (!bh) - return ERR_PTR(err); + bh = ext4_bread(handle, inode, *block, 1); + if (IS_ERR(bh)) + return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); @@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, { struct buffer_head *bh; struct ext4_dir_entry *dirent; - int err = 0, is_dx_block = 0; + int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0, &err); - if (!bh) { - if (err == 0) { - ext4_error_inode(inode, __func__, line, block, - "Directory hole found"); - return ERR_PTR(-EIO); - } + bh = ext4_bread(NULL, inode, block, 0); + if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, __func__, line, - "error reading directory block " - "(ino %lu, block %lu)", inode->i_ino, + "error %ld reading directory block " + "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, (unsigned long) block); - return ERR_PTR(err); + + return bh; + } + if (!bh) { + ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ @@ -640,7 +640,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + bh = ext4_bread(NULL,dir, block, 0); + if (!bh || IS_ERR(bh)) + continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0b28b36e7915..896e452b739d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5305,7 +5305,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; @@ -5320,9 +5319,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; - bh = ext4_bread(NULL, inode, blk, 0, &err); - if (err) - return err; + bh = ext4_bread(NULL, inode, blk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else @@ -5343,8 +5342,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); + int err, offset = off & (sb->s_blocksize - 1); struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5365,14 +5363,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1, &err); + bh = ext4_bread(handle, inode, blk, 1); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); - goto out; + return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); @@ -5381,8 +5381,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) - return err; if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; -- cgit v1.2.3 From dd73b5d5cb675e2aa3b1d4952e208af1546f91c1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 29 Aug 2014 20:52:17 -0400 Subject: ext4: convert dx_probe() to use the ERR_PTR convention Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 89 +++++++++++++++++++++++---------------------------------- 1 file changed, 35 insertions(+), 54 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index af13c908f617..e6d51655ffcd 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -253,8 +253,7 @@ static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); + struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); @@ -670,29 +669,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, */ static struct dx_frame * dx_probe(const struct qstr *d_name, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; - struct buffer_head *bh; struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; - frame->bh = NULL; - bh = ext4_read_dirblock(dir, 0, INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail; - } - root = (struct dx_root *) bh->b_data; + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + + root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } hinfo->hash_version = root->info.hash_version; @@ -706,16 +701,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.unused_flags & 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } if ((indirect = root->info.indirect_levels) > 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } @@ -725,27 +716,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } dxtrace(printk("Look up %x", hash)); - while (1) - { + while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } p = entries + 1; q = entries + count - 1; - while (p <= q) - { + while (p <= q) { m = p + (q - p)/2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) @@ -754,8 +739,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, p = m + 1; } - if (0) // linear search cross check - { + if (0) { // linear search cross check unsigned n = count - 1; at = entries; while (n--) @@ -772,38 +756,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir, at = p - 1; dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; frame->entries = entries; frame->at = at; - if (!indirect--) return frame; - bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail2; + if (!indirect--) + return frame; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { + ret_err = (struct dx_frame *) frame->bh; + frame->bh = NULL; + goto fail; } - entries = ((struct dx_node *) bh->b_data)->entries; + entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } - frame++; - frame->bh = NULL; } -fail2: +fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } -fail: - if (*err == ERR_BAD_DX_DIR) + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning(dir->i_sb, "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); - return NULL; + return ret_err; } static void dx_release (struct dx_frame *frames) @@ -989,9 +970,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(NULL, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { @@ -1369,11 +1350,11 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q struct dx_frame frames[2], *frame; struct buffer_head *bh; ext4_lblk_t block; - int err = 0, retval; + int retval; - frame = dx_probe(d_name, dir, &hinfo, frames, &err); - if (err) - return ERR_PTR(err); + frame = dx_probe(d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT); @@ -1977,9 +1958,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); -- cgit v1.2.3 From f8b3b59d4d561368cf8c92d50218fc0d5be7cb46 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 29 Aug 2014 20:52:18 -0400 Subject: ext4: convert do_split() to use the ERR_PTR convention Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e6d51655ffcd..dec92b675b35 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1509,7 +1509,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) + struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; @@ -1532,8 +1532,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - *error = PTR_ERR(bh2); - return NULL; + return (struct ext4_dir_entry_2 *) bh2; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1593,8 +1592,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ - if (hinfo->hash >= hash2) - { + if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } @@ -1614,8 +1612,7 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); - *error = err; - return NULL; + return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, @@ -1838,8 +1835,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ext4_handle_dirty_dx_node(handle, dir, frame->bh); ext4_handle_dirty_dirent_node(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { + de = do_split(handle,dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up @@ -1847,7 +1844,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, */ ext4_mark_inode_dirty(handle, dir); dx_release(frames); - return retval; + return PTR_ERR(de); } dx_release(frames); @@ -2071,9 +2068,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) + de = do_split(handle, dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { + err = PTR_ERR(de); goto cleanup; + } err = add_dirent_to_buf(handle, dentry, inode, de, bh); goto cleanup; -- cgit v1.2.3 From 52c826db6d4b638677683c79e6c465b99074be74 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Fri, 29 Aug 2014 23:20:44 -0400 Subject: ext4: remove a duplicate call in ext4_init_new_dir() ext4_journal_get_write_access() has just been called in ext4_append() calling it again here is duplicated. Signed-off-by: Wang Shilong Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index dec92b675b35..51705f8c4116 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2378,10 +2378,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out; de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); -- cgit v1.2.3 From ee124d2746250786b306952bb8955d3171fa8e69 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 30 Aug 2014 23:34:06 -0400 Subject: ext4: use ext4_update_i_disksize instead of opencoded ones Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8aa241a000c5..cc95dca5cb8a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2651,10 +2651,7 @@ static int ext4_da_write_end(struct file *file, if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_has_inline_data(inode) || ext4_da_should_update_i_disksize(page, end)) { - down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - up_write(&EXT4_I(inode)->i_data_sem); + ext4_update_i_disksize(inode, new_i_size); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) -- cgit v1.2.3 From f8fb4f415034baeed983ca2fb0f51bd74d7370b0 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 30 Aug 2014 23:50:56 -0400 Subject: ext4: use ext4_ext_next_allocated_block instead of mext_next_extent This allows us to make mext_next_extent static and potentially get rid of it. Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 -- fs/ext4/extents.c | 16 +++++++--------- fs/ext4/move_extent.c | 2 +- 3 files changed, 8 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ca53bcece838..420c9be9f7ae 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2753,8 +2753,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); -extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74292a71b384..1b768343ddf1 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5304,7 +5304,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block, current_block; + ext4_lblk_t stop_block; ext4_lblk_t ex_start, ex_end; /* Let path point to the last extent */ @@ -5365,17 +5365,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, (unsigned long) start); return -EIO; } - - current_block = le32_to_cpu(extent->ee_block); - if (start > current_block) { + if (start > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ - ret = mext_next_extent(inode, path, &extent); - if (ret != 0) { + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + start = ext4_ext_next_allocated_block(path); ext4_ext_drop_refs(path); kfree(path); - if (ret == 1) - ret = 0; - break; + continue; } } ret = ext4_ext_shift_path_extents(path, shift, inode, diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 671a74b14fd7..123a51b05965 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -76,7 +76,7 @@ copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) * ext4_ext_path structure refers to the last extent, or a negative error * value on failure. */ -int +static int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent **extent) { -- cgit v1.2.3 From fcf6b1b729bcd23f2b49a84fb33ffbb44712ee6a Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sat, 30 Aug 2014 23:52:19 -0400 Subject: ext4: refactor ext4_move_extents code base ext4_move_extents is too complex for review. It has duplicate almost each function available in the rest of other codebase. It has useless artificial restriction orig_offset == donor_offset. But in fact logic of ext4_move_extents is very simple: Iterate extents one by one (similar to ext4_fill_fiemap_extents) ->Iterate each page covered extent (similar to generic_perform_write) ->swap extents for covered by page (can be shared with IOC_MOVE_DATA) Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 + fs/ext4/extents.c | 234 +++++++++++- fs/ext4/move_extent.c | 990 ++++++-------------------------------------------- 3 files changed, 338 insertions(+), 891 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 420c9be9f7ae..cf3ad75d3015 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2740,10 +2740,15 @@ extern int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, ext4_lblk_t lblk_end); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1b768343ddf1..73d9ae9a16db 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -291,6 +291,19 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } +static inline int +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path *path, ext4_lblk_t lblk, + int nofail) +{ + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + + return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | + (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); +} + /* * Calculate the number of metadata blocks needed * to allocate @blocks @@ -1559,7 +1572,7 @@ found_extent: * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -2854,24 +2867,14 @@ again: */ if (end >= ee_block && end < ee_block + ext4_ext_get_actual_len(ex) - 1) { - int split_flag = 0; - - if (ext4_ext_is_unwritten(ex)) - split_flag = EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; - /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_METADATA_NOFAIL); - + err = ext4_force_split_extent_at(handle, inode, path, + end + 1, 1); if (err < 0) goto out; } @@ -5506,3 +5509,208 @@ out_mutex: mutex_unlock(&inode->i_mutex); return ret; } + +/** + * ext4_swap_extents - Swap extents between two inodes + * + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what is promise "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by caller. + * Locking: + * i_mutex is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + + *erp = ext4_es_remove_extent(inode1, lblk1, count); + if (*erp) + return 0; + *erp = ext4_es_remove_extent(inode2, lblk2, count); + if (*erp) + return 0; + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path1)) { + *erp = PTR_ERR(path1); + break; + } + path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + if (IS_ERR(path2)) { + *erp = PTR_ERR(path2); + break; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have somthing to swap ? */ + if (unlikely(!ex2 || !ex1)) + break; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e1_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + break; + /* Move to the rightest boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + goto repeat; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + path1, lblk1, 0); + if (*erp) + break; + } + if (e2_blk < lblk2) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + path2, lblk2, 0); + if (*erp) + break; + } + /* ext4_split_extent_at() may retult in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + path1, lblk1 + len, 0); + if (*erp) + break; + } + if (len != e2_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + path2, lblk2 + len, 0); + if (*erp) + break; + } + /* ext4_split_extent_at() may retult in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (*erp) + break; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (*erp) + break; + + /* Both extents are fully inside boundaries. Swap it now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (*erp) + break; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scarry ah..? second inode already points to new blocks, + * and it was successfully dirtied. But luckily error may happen + * only due to journal error, so full transaction will be + * aborted anyway. + */ + if (*erp) + break; + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + + repeat: + if (path1) { + ext4_ext_drop_refs(path1); + kfree(path1); + path1 = NULL; + } + if (path2) { + ext4_ext_drop_refs(path2); + kfree(path2); + path2 = NULL; + } + } + if (path1) { + ext4_ext_drop_refs(path1); + kfree(path1); + } + if (path2) { + ext4_ext_drop_refs(path2); + kfree(path2); + } + return replaced_count; +} diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 123a51b05965..c8f895b410f6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -48,101 +48,6 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, return ret; } -/** - * copy_extent_status - Copy the extent's initialization status - * - * @src: an extent for getting initialize status - * @dest: an extent to be set the status - */ -static void -copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) -{ - if (ext4_ext_is_unwritten(src)) - ext4_ext_mark_unwritten(dest); - else - dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); -} - -/** - * mext_next_extent - Search for the next extent and set it to "extent" - * - * @inode: inode which is searched - * @path: this will obtain data for the next extent - * @extent: pointer to the next extent we have just gotten - * - * Search the next extent in the array of ext4_ext_path structure (@path) - * and set it to ext4_extent structure (@extent). In addition, the member of - * @path (->p_ext) also points the next extent. Return 0 on success, 1 if - * ext4_ext_path structure refers to the last extent, or a negative error - * value on failure. - */ -static int -mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent) -{ - struct ext4_extent_header *eh; - int ppos, leaf_ppos = path->p_depth; - - ppos = leaf_ppos; - if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { - /* leaf block */ - *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); - return 0; - } - - while (--ppos >= 0) { - if (EXT_LAST_INDEX(path[ppos].p_hdr) > - path[ppos].p_idx) { - int cur_ppos = ppos; - - /* index block */ - path[ppos].p_idx++; - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); - if (path[ppos+1].p_bh) - brelse(path[ppos+1].p_bh); - path[ppos+1].p_bh = - sb_bread(inode->i_sb, path[ppos].p_block); - if (!path[ppos+1].p_bh) - return -EIO; - path[ppos+1].p_hdr = - ext_block_hdr(path[ppos+1].p_bh); - - /* Halfway index block */ - while (++cur_ppos < leaf_ppos) { - path[cur_ppos].p_idx = - EXT_FIRST_INDEX(path[cur_ppos].p_hdr); - path[cur_ppos].p_block = - ext4_idx_pblock(path[cur_ppos].p_idx); - if (path[cur_ppos+1].p_bh) - brelse(path[cur_ppos+1].p_bh); - path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, - path[cur_ppos].p_block); - if (!path[cur_ppos+1].p_bh) - return -EIO; - path[cur_ppos+1].p_hdr = - ext_block_hdr(path[cur_ppos+1].p_bh); - } - - path[leaf_ppos].p_ext = *extent = NULL; - - eh = path[leaf_ppos].p_hdr; - if (le16_to_cpu(eh->eh_entries) == 0) - /* empty leaf is found */ - return -ENODATA; - - /* leaf block */ - path[leaf_ppos].p_ext = *extent = - EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); - path[leaf_ppos].p_block = - ext4_ext_pblock(path[leaf_ppos].p_ext); - return 0; - } - } - /* We found the last extent */ - return 1; -} - /** * ext4_double_down_write_data_sem - Acquire two inodes' write lock * of i_data_sem @@ -177,417 +82,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, up_write(&EXT4_I(donor_inode)->i_data_sem); } -/** - * mext_insert_across_blocks - Insert extents across leaf block - * - * @handle: journal handle - * @orig_inode: original inode - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Allocate a new leaf block and insert extents into it. Return 0 on success, - * or a negative error value on failure. - */ -static int -mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, - struct ext4_extent *o_start, struct ext4_extent *o_end, - struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_ext_path *orig_path = NULL; - ext4_lblk_t eblock = 0; - int new_flag = 0; - int end_flag = 0; - int err = 0; - - if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { - if (o_start == o_end) { - - /* start_ext new_ext end_ext - * donor |---------|-----------|--------| - * orig |------------------------------| - */ - end_flag = 1; - } else { - - /* start_ext new_ext end_ext - * donor |---------|----------|---------| - * orig |---------------|--------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - } - - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (start_ext->ee_len && new_ext->ee_len && - !end_ext->ee_len && o_start == o_end) { - - /* start_ext new_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (!start_ext->ee_len && new_ext->ee_len && - end_ext->ee_len && o_start == o_end) { - - /* new_ext end_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - - /* - * Set 0 to the extent block if new_ext was - * the first block. - */ - if (new_ext->ee_block) - eblock = le32_to_cpu(new_ext->ee_block); - - new_flag = 1; - } else { - ext4_debug("ext4 move extent: Unexpected insert case\n"); - return -EIO; - } - - if (new_flag) { - err = get_ext_path(orig_inode, eblock, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext, 0)) - goto out; - } - - if (end_flag) { - err = get_ext_path(orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext, 0)) - goto out; - } -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - - return err; - -} - -/** - * mext_insert_inside_block - Insert new extent to the extent block - * - * @o_start: first original extent to be moved - * @o_end: last original extent to be moved - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * @eh: extent header of target leaf block - * @range_to_move: used to decide how to insert extent - * - * Insert extents into the leaf block. The extent (@o_start) is overwritten - * by inserted extents. - */ -static void -mext_insert_inside_block(struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext, - struct ext4_extent_header *eh, - int range_to_move) -{ - int i = 0; - unsigned long len; - - /* Move the existing extents */ - if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { - len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - - (unsigned long)(o_end + 1); - memmove(o_end + 1 + range_to_move, o_end + 1, len); - } - - /* Insert start entry */ - if (start_ext->ee_len) - o_start[i++].ee_len = start_ext->ee_len; - - /* Insert new entry */ - if (new_ext->ee_len) { - o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); - } - - /* Insert end entry */ - if (end_ext->ee_len) - o_start[i] = *end_ext; - - /* Increment the total entries counter on the extent block */ - le16_add_cpu(&eh->eh_entries, range_to_move); -} - -/** - * mext_insert_extents - Insert new extent - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Call the function to insert extents. If we cannot add more extents into - * the leaf block, we call mext_insert_across_blocks() to create a - * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 - * on success, or a negative error value on failure. - */ -static int -mext_insert_extents(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, - struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_extent_header *eh; - unsigned long need_slots, slots_range; - int range_to_move, depth, ret; - - /* - * The extents need to be inserted - * start_extent + new_extent + end_extent. - */ - need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + - (new_ext->ee_len ? 1 : 0); - - /* The number of slots between start and end */ - slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) - / sizeof(struct ext4_extent); - - /* Range to move the end of extent */ - range_to_move = need_slots - slots_range; - depth = orig_path->p_depth; - orig_path += depth; - eh = orig_path->p_hdr; - - if (depth) { - /* Register to journal */ - BUFFER_TRACE(orig_path->p_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, orig_path->p_bh); - if (ret) - return ret; - } - - /* Expansion */ - if (range_to_move > 0 && - (range_to_move > le16_to_cpu(eh->eh_max) - - le16_to_cpu(eh->eh_entries))) { - - ret = mext_insert_across_blocks(handle, orig_inode, o_start, - o_end, start_ext, new_ext, end_ext); - if (ret < 0) - return ret; - } else - mext_insert_inside_block(o_start, o_end, start_ext, new_ext, - end_ext, eh, range_to_move); - - return ext4_ext_dirty(handle, orig_inode, orig_path); -} - -/** - * mext_leaf_block - Move one leaf extent block into the inode. - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @dext: donor extent - * @from: start offset on the target file - * - * In order to insert extents into the leaf block, we must divide the extent - * in the leaf block into three extents. The one is located to be inserted - * extents, and the others are located around it. - * - * Therefore, this function creates structures to save extents of the leaf - * block, and inserts extents by calling mext_insert_extents() with - * created extents. Return 0 on success, or a negative error value on failure. - */ -static int -mext_leaf_block(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, struct ext4_extent *dext, - ext4_lblk_t *from) -{ - struct ext4_extent *oext, *o_start, *o_end, *prev_ext; - struct ext4_extent new_ext, start_ext, end_ext; - ext4_lblk_t new_ext_end; - int oext_alen, new_ext_alen, end_ext_alen; - int depth = ext_depth(orig_inode); - int ret; - - start_ext.ee_block = end_ext.ee_block = 0; - o_start = o_end = oext = orig_path[depth].p_ext; - oext_alen = ext4_ext_get_actual_len(oext); - start_ext.ee_len = end_ext.ee_len = 0; - - new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); - new_ext.ee_len = dext->ee_len; - new_ext_alen = ext4_ext_get_actual_len(&new_ext); - new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - - /* - * Case: original extent is first - * oext |--------| - * new_ext |--| - * start_ext |--| - */ - if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && - le32_to_cpu(new_ext.ee_block) < - le32_to_cpu(oext->ee_block) + oext_alen) { - start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - - le32_to_cpu(oext->ee_block)); - start_ext.ee_block = oext->ee_block; - copy_extent_status(oext, &start_ext); - } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { - prev_ext = oext - 1; - /* - * We can merge new_ext into previous extent, - * if these are contiguous and same extent type. - */ - if (ext4_can_extents_be_merged(orig_inode, prev_ext, - &new_ext)) { - o_start = prev_ext; - start_ext.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(prev_ext) + - new_ext_alen); - start_ext.ee_block = oext->ee_block; - copy_extent_status(prev_ext, &start_ext); - new_ext.ee_len = 0; - } - } - - /* - * Case: new_ext_end must be less than oext - * oext |-----------| - * new_ext |-------| - */ - if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - EXT4_ERROR_INODE(orig_inode, - "new_ext_end(%u) should be less than or equal to " - "oext->ee_block(%u) + oext_alen(%d) - 1", - new_ext_end, le32_to_cpu(oext->ee_block), - oext_alen); - ret = -EIO; - goto out; - } - - /* - * Case: new_ext is smaller than original extent - * oext |---------------| - * new_ext |-----------| - * end_ext |---| - */ - if (le32_to_cpu(oext->ee_block) <= new_ext_end && - new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { - end_ext.ee_len = - cpu_to_le16(le32_to_cpu(oext->ee_block) + - oext_alen - 1 - new_ext_end); - copy_extent_status(oext, &end_ext); - end_ext_alen = ext4_ext_get_actual_len(&end_ext); - ext4_ext_store_pblock(&end_ext, - (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); - end_ext.ee_block = - cpu_to_le32(le32_to_cpu(o_end->ee_block) + - oext_alen - end_ext_alen); - } - - ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, - o_end, &start_ext, &new_ext, &end_ext); -out: - return ret; -} - -/** - * mext_calc_swap_extents - Calculate extents for extent swapping. - * - * @tmp_dext: the extent that will belong to the original inode - * @tmp_oext: the extent that will belong to the donor inode - * @orig_off: block offset of original inode - * @donor_off: block offset of donor inode - * @max_count: the maximum length of extents - * - * Return 0 on success, or a negative error value on failure. - */ -static int -mext_calc_swap_extents(struct ext4_extent *tmp_dext, - struct ext4_extent *tmp_oext, - ext4_lblk_t orig_off, ext4_lblk_t donor_off, - ext4_lblk_t max_count) -{ - ext4_lblk_t diff, orig_diff; - struct ext4_extent dext_old, oext_old; - - BUG_ON(orig_off != donor_off); - - /* original and donor extents have to cover the same block offset */ - if (orig_off < le32_to_cpu(tmp_oext->ee_block) || - le32_to_cpu(tmp_oext->ee_block) + - ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) - return -ENODATA; - - if (orig_off < le32_to_cpu(tmp_dext->ee_block) || - le32_to_cpu(tmp_dext->ee_block) + - ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) - return -ENODATA; - - dext_old = *tmp_dext; - oext_old = *tmp_oext; - - /* When tmp_dext is too large, pick up the target range. */ - diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - - ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - le32_add_cpu(&tmp_dext->ee_block, diff); - le16_add_cpu(&tmp_dext->ee_len, -diff); - - if (max_count < ext4_ext_get_actual_len(tmp_dext)) - tmp_dext->ee_len = cpu_to_le16(max_count); - - orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); - - /* Adjust extent length if donor extent is larger than orig */ - if (ext4_ext_get_actual_len(tmp_dext) > - ext4_ext_get_actual_len(tmp_oext) - orig_diff) - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - - orig_diff); - - tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); - - copy_extent_status(&oext_old, tmp_dext); - copy_extent_status(&dext_old, tmp_oext); - - return 0; -} - /** * mext_check_coverage - Check that all extents in range has the same type * @@ -647,129 +141,6 @@ out: * * Return replaced block count. */ -static int -mext_replace_branches(handle_t *handle, struct inode *orig_inode, - struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count, int *err) -{ - struct ext4_ext_path *orig_path = NULL; - struct ext4_ext_path *donor_path = NULL; - struct ext4_extent *oext, *dext; - struct ext4_extent tmp_dext, tmp_oext; - ext4_lblk_t orig_off = from, donor_off = from; - int depth; - int replaced_count = 0; - int dext_alen; - - *err = ext4_es_remove_extent(orig_inode, from, count); - if (*err) - goto out; - - *err = ext4_es_remove_extent(donor_inode, from, count); - if (*err) - goto out; - - /* Get the original extent for the block "orig_off" */ - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - - /* Get the donor extent for the head */ - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - if (unlikely(!dext)) - goto missing_donor_extent; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count); - if (*err) - goto out; - - /* Loop for the donor extents */ - while (1) { - /* The extent for donor must be found. */ - if (unlikely(!dext)) { - missing_donor_extent: - EXT4_ERROR_INODE(donor_inode, - "The extent for donor must be found"); - *err = -EIO; - goto out; - } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - EXT4_ERROR_INODE(donor_inode, - "Donor offset(%u) and the first block of donor " - "extent(%u) should be equal", - donor_off, - le32_to_cpu(tmp_dext.ee_block)); - *err = -EIO; - goto out; - } - - /* Set donor extent to orig extent */ - *err = mext_leaf_block(handle, orig_inode, - orig_path, &tmp_dext, &orig_off); - if (*err) - goto out; - - /* Set orig extent to donor extent */ - *err = mext_leaf_block(handle, donor_inode, - donor_path, &tmp_oext, &donor_off); - if (*err) - goto out; - - dext_alen = ext4_ext_get_actual_len(&tmp_dext); - replaced_count += dext_alen; - donor_off += dext_alen; - orig_off += dext_alen; - - BUG_ON(replaced_count > count); - /* Already moved the expected blocks */ - if (replaced_count >= count) - break; - - if (orig_path) - ext4_ext_drop_refs(orig_path); - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - if (donor_path) - ext4_ext_drop_refs(donor_path); - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count - replaced_count); - if (*err) - goto out; - } - -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (donor_path) { - ext4_ext_drop_refs(donor_path); - kfree(donor_path); - } - - return replaced_count; -} /** * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 @@ -783,7 +154,7 @@ out: */ static int mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index, struct page *page[2]) + pgoff_t index1, pgoff_t index2, struct page *page[2]) { struct address_space *mapping[2]; unsigned fl = AOP_FLAG_NOFS; @@ -793,15 +164,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { + pgoff_t tmp = index1; + index1 = index2; + index2 = tmp; mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } - page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); if (!page[0]) return -ENOMEM; - page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); if (!page[1]) { unlock_page(page[0]); page_cache_release(page[0]); @@ -905,13 +279,14 @@ out: */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int unwritten, int *err) + pgoff_t orig_page_offset, pgoff_t donor_page_offset, + int data_offset_in_page, + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; handle_t *handle; - ext4_lblk_t orig_blk_offset; + ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; @@ -939,6 +314,9 @@ again: orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; + donor_blk_offset = donor_page_offset * blocks_per_page + + data_offset_in_page; + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { @@ -959,7 +337,7 @@ again: replaced_size = data_size; *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - pagep); + donor_page_offset, pagep); if (unlikely(*err < 0)) goto stop_journal; /* @@ -978,7 +356,7 @@ again: if (*err) goto drop_data_sem; - unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; @@ -994,9 +372,10 @@ again: *err = -EBUSY; goto drop_data_sem; } - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, + donor_inode, orig_blk_offset, + donor_blk_offset, + block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; @@ -1014,9 +393,9 @@ data_copy: goto unlock_pages; } ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { @@ -1061,9 +440,9 @@ repair_branches: * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, - orig_blk_offset, - block_len_in_page, &err2); + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), @@ -1093,10 +472,14 @@ mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { - ext4_lblk_t orig_blocks, donor_blocks; + __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1112,7 +495,7 @@ mext_check_arguments(struct inode *orig_inode, ext4_debug("ext4 move extent: The argument files should " "not be swapfile [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; + return -EBUSY; } /* Ext4 move extent supports only extent based file */ @@ -1132,67 +515,28 @@ mext_check_arguments(struct inode *orig_inode, } /* Start offset should be same */ - if (orig_start != donor_start) { + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " - "offset are not same [ino:orig %lu, donor %lu]\n", + "offset are not alligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || + (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - - if (orig_inode->i_size > donor_inode->i_size) { - donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; - /* TODO: eliminate this artificial restriction */ - if (orig_start >= donor_blocks) { - ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, donor_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: eliminate this artificial restriction */ - if (orig_start + *len > donor_blocks) { - ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file blocks [%u]." - "So adjust length from %llu to %llu " - "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_blocks, - *len, donor_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = donor_blocks - orig_start; - } - } else { - orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; - if (orig_start >= orig_blocks) { - ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, orig_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (orig_start + *len > orig_blocks) { - ext4_debug("ext4 move extent: Adjust length " - "from %llu to %llu. Because it should be " - "less than original file blocks " - "[ino:orig %lu, donor %lu]\n", - *len, orig_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = orig_blocks - orig_start; - } - } - + if (orig_eof < orig_start + *len - 1) + *len = orig_eof - orig_start; + if (donor_eof < donor_start + *len - 1) + *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, @@ -1245,23 +589,16 @@ mext_check_arguments(struct inode *orig_inode, * 7:Return 0 on success, or a negative error value on failure. */ int -ext4_move_extents(struct file *o_filp, struct file *d_filp, - __u64 orig_start, __u64 donor_start, __u64 len, - __u64 *moved_len) +ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); - struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; - struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; - ext4_lblk_t block_start = orig_start; - ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; - ext4_lblk_t rest_blocks; - pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; - int data_offset_in_page; - int block_len_in_page; - int unwritten; + ext4_lblk_t o_end, o_start = orig_blk; + ext4_lblk_t d_start = donor_blk; + int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1303,121 +640,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len); + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, + donor_blk, &len); if (ret) goto out; + o_end = o_start + len; - file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; - block_end = block_start + len - 1; - if (file_end < block_end) - len -= block_end - file_end; - - ret = get_ext_path(orig_inode, block_start, &orig_path); - if (ret) - goto out; - - /* Get path structure to check the hole */ - ret = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret) - goto out; + while (o_start < o_end) { + struct ext4_extent *ex; + ext4_lblk_t cur_blk, next_blk; + pgoff_t orig_page_index, donor_page_index; + int offset_in_page; + int unwritten, cur_len; - depth = ext_depth(orig_inode); - ext_cur = holecheck_path[depth].p_ext; - - /* - * Get proper starting location of block replacement if block_start was - * within the hole. - */ - if (le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { - /* - * The hole exists between extents or the tail of - * original file. - */ - last_extent = mext_next_extent(orig_inode, - holecheck_path, &ext_cur); - if (last_extent < 0) { - ret = last_extent; - goto out; - } - last_extent = mext_next_extent(orig_inode, orig_path, - &ext_dummy); - if (last_extent < 0) { - ret = last_extent; + ret = get_ext_path(orig_inode, o_start, &path); + if (ret) goto out; + ex = path[path->p_depth].p_ext; + next_blk = ext4_ext_next_allocated_block(path); + cur_blk = le32_to_cpu(ex->ee_block); + cur_len = ext4_ext_get_actual_len(ex); + /* Check hole before the start pos */ + if (cur_blk + cur_len - 1 < o_start) { + if (next_blk == EXT_MAX_BLOCKS) { + o_start = o_end; + ret = -ENODATA; + goto out; + } + d_start += next_blk - o_start; + o_start = next_blk; + goto repeat; + /* Check hole after the start pos */ + } else if (cur_blk > o_start) { + /* Skip hole */ + d_start += cur_blk - o_start; + o_start = cur_blk; + /* Extent inside requested range ?*/ + if (cur_blk >= o_end) + goto out; + } else { /* in_range(o_start, o_blk, o_len) */ + cur_len += cur_blk - o_start; } - seq_start = le32_to_cpu(ext_cur->ee_block); - } else if (le32_to_cpu(ext_cur->ee_block) > block_start) - /* The hole exists at the beginning of original file. */ - seq_start = le32_to_cpu(ext_cur->ee_block); - else - seq_start = block_start; - - /* No blocks within the specified range. */ - if (le32_to_cpu(ext_cur->ee_block) > block_end) { - ext4_debug("ext4 move extent: The specified range of file " - "may be the hole\n"); - ret = -EINVAL; - goto out; - } - - /* Adjust start blocks */ - add_blocks = min(le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur), block_end + 1) - - max(le32_to_cpu(ext_cur->ee_block), block_start); - - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { - seq_blocks += add_blocks; - - /* Adjust tail blocks */ - if (seq_start + seq_blocks - 1 > block_end) - seq_blocks = block_end - seq_start + 1; - - ext_prev = ext_cur; - last_extent = mext_next_extent(orig_inode, holecheck_path, - &ext_cur); - if (last_extent < 0) { - ret = last_extent; - break; - } - add_blocks = ext4_ext_get_actual_len(ext_cur); - - /* - * Extend the length of contiguous block (seq_blocks) - * if extents are contiguous. - */ - if (ext4_can_extents_be_merged(orig_inode, - ext_prev, ext_cur) && - block_end >= le32_to_cpu(ext_cur->ee_block) && - !last_extent) - continue; - - /* Is original extent is unwritten */ - unwritten = ext4_ext_is_unwritten(ext_prev); - - data_offset_in_page = seq_start % blocks_per_page; - - /* - * Calculate data blocks count that should be swapped - * at the first page. - */ - if (data_offset_in_page + seq_blocks > blocks_per_page) { - /* Swapped blocks are across pages */ - block_len_in_page = - blocks_per_page - data_offset_in_page; - } else { - /* Swapped blocks are in a page */ - block_len_in_page = seq_blocks; - } - - orig_page_offset = seq_start >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_end_page = (seq_start + seq_blocks - 1) >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_start = le32_to_cpu(ext_cur->ee_block); - rest_blocks = seq_blocks; - + unwritten = ext4_ext_is_unwritten(ex); + if (o_end - o_start < cur_len) + cur_len = o_end - o_start; + + orig_page_index = o_start >> (PAGE_CACHE_SHIFT - + orig_inode->i_blkbits); + donor_page_index = d_start >> (PAGE_CACHE_SHIFT - + donor_inode->i_blkbits); + offset_in_page = o_start % blocks_per_page; + if (cur_len > blocks_per_page- offset_in_page) + cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, @@ -1426,76 +700,36 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); - - while (orig_page_offset <= seq_end_page) { - - /* Swap original branches with new branches */ - block_len_in_page = move_extent_per_page( - o_filp, donor_inode, - orig_page_offset, - data_offset_in_page, - block_len_in_page, - unwritten, &ret); - - /* Count how many blocks we have exchanged */ - *moved_len += block_len_in_page; - if (ret < 0) - break; - if (*moved_len > len) { - EXT4_ERROR_INODE(orig_inode, - "We replaced blocks too much! " - "sum of replaced: %llu requested: %llu", - *moved_len, len); - ret = -EIO; - break; - } - - orig_page_offset++; - data_offset_in_page = 0; - rest_blocks -= block_len_in_page; - if (rest_blocks > blocks_per_page) - block_len_in_page = blocks_per_page; - else - block_len_in_page = rest_blocks; - } - + /* Swap original branches with new branches */ + move_extent_per_page(o_filp, donor_inode, + orig_page_index, donor_page_index, + offset_in_page, cur_len, + unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; - - /* Decrease buffer counter */ - if (holecheck_path) - ext4_ext_drop_refs(holecheck_path); - ret = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret) - break; - depth = holecheck_path->p_depth; - - /* Decrease buffer counter */ - if (orig_path) - ext4_ext_drop_refs(orig_path); - ret = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret) - break; - - ext_cur = holecheck_path[depth].p_ext; - add_blocks = ext4_ext_get_actual_len(ext_cur); - seq_blocks = 0; - + o_start += cur_len; + d_start += cur_len; + repeat: + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + path = NULL; + } } + *moved_len = o_start - orig_blk; + if (*moved_len > len) + *moved_len = len; + out: if (*moved_len) { ext4_discard_preallocations(orig_inode); ext4_discard_preallocations(donor_inode); } - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (holecheck_path) { - ext4_ext_drop_refs(holecheck_path); - kfree(holecheck_path); + if (path) { + ext4_ext_drop_refs(path); + kfree(path); } ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); -- cgit v1.2.3 From 19008f6dfa16d23afcd09dceaa598bb6da8de4b1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 31 Aug 2014 15:03:14 -0400 Subject: ext4: fix ext4_swap_extents() error handling If ext4_ext_find_extent() returns an error, we have to clear path1 or path2 or else we would end up trying to free an ERR_PTR, which would be bad. Also eliminate some redundant code and mark the error paths as unlikely() Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 62 ++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73d9ae9a16db..d00937336f19 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -869,7 +869,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, if (!path) { path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); - if (!path) + if (unlikely(!path)) return ERR_PTR(-ENOMEM); alloc = 1; } @@ -889,7 +889,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); - if (IS_ERR(bh)) { + if (unlikely(IS_ERR(bh))) { ret = PTR_ERR(bh); goto err; } @@ -5545,10 +5545,10 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!mutex_is_locked(&inode1->i_mutex)); *erp = ext4_es_remove_extent(inode1, lblk1, count); - if (*erp) + if (unlikely(*erp)) return 0; *erp = ext4_es_remove_extent(inode2, lblk2, count); - if (*erp) + if (unlikely(*erp)) return 0; while (count) { @@ -5558,20 +5558,24 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int split = 0; path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); - if (IS_ERR(path1)) { + if (unlikely(IS_ERR(path1))) { *erp = PTR_ERR(path1); - break; + path1 = NULL; + finish: + count = 0; + goto repeat; } path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); - if (IS_ERR(path2)) { + if (unlikely(IS_ERR(path2))) { *erp = PTR_ERR(path2); - break; + path2 = NULL; + goto finish; } ex1 = path1[path1->p_depth].p_ext; ex2 = path2[path2->p_depth].p_ext; /* Do we have somthing to swap ? */ if (unlikely(!ex2 || !ex1)) - break; + goto finish; e1_blk = le32_to_cpu(ex1->ee_block); e2_blk = le32_to_cpu(ex2->ee_block); @@ -5593,7 +5597,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, next2 = e1_blk; /* Do we have something to swap */ if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) - break; + goto finish; /* Move to the rightest boundary */ len = next1 - lblk1; if (len < next2 - lblk2) @@ -5611,15 +5615,15 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, split = 1; *erp = ext4_force_split_extent_at(handle, inode1, path1, lblk1, 0); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; } if (e2_blk < lblk2) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, path2, lblk2, 0); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; } /* ext4_split_extent_at() may retult in leaf extent split, * path must to be revalidated. */ @@ -5637,15 +5641,15 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, split = 1; *erp = ext4_force_split_extent_at(handle, inode1, path1, lblk1 + len, 0); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; } if (len != e2_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, path2, lblk2 + len, 0); if (*erp) - break; + goto finish; } /* ext4_split_extent_at() may retult in leaf extent split, * path must to be revalidated. */ @@ -5654,11 +5658,11 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(e2_len != e1_len); *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; /* Both extents are fully inside boundaries. Swap it now */ tmp_ex = *ex1; @@ -5675,8 +5679,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, ext4_ext_try_to_merge(handle, inode1, path1, ex1); *erp = ext4_ext_dirty(handle, inode2, path2 + path2->p_depth); - if (*erp) - break; + if (unlikely(*erp)) + goto finish; *erp = ext4_ext_dirty(handle, inode1, path1 + path1->p_depth); /* @@ -5685,8 +5689,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, * only due to journal error, so full transaction will be * aborted anyway. */ - if (*erp) - break; + if (unlikely(*erp)) + goto finish; lblk1 += len; lblk2 += len; replaced_count += len; @@ -5704,13 +5708,5 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, path2 = NULL; } } - if (path1) { - ext4_ext_drop_refs(path1); - kfree(path1); - } - if (path2) { - ext4_ext_drop_refs(path2); - kfree(path2); - } return replaced_count; } -- cgit v1.2.3 From 713e8dde3e71e92db2d8cc8459d236ce1fb576ce Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:32:09 -0400 Subject: ext4: fix ZERO_RANGE bug hidden by flag aliasing We accidently aliased EXT4_EX_NOCACHE and EXT4_GET_CONVERT_UNWRITTEN falgs, which apparently was hiding a bug that was unmasked when this flag aliasing issue was addressed (see the subsequent commit). The reproduction case was: fsx -N 10000 -l 500000 -r 4096 -t 4096 -w 4096 -Z -R -W /vdb/junk ... which would cause fsx to report corruption in the data file. The fix we have is a bit of an overkill, but I'd much rather be conservative for now, and we can optimize ZERO_RANGE_FL handling later. The fact that we need to zap the extent_status cache for the inode is unfortunate, but correctness is far more important than performance. Signed-off-by: Theodore Ts'o Cc: Namjae Jeon --- fs/ext4/extents.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d00937336f19..bf205f72be35 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4802,7 +4802,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, max_blocks -= lblk; flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4840,15 +4841,21 @@ static long ext4_zero_range(struct file *file, loff_t offset, ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out_dio; /* * Remove entire range from the extent status tree. + * + * ext4_es_remove_extent(inode, lblk, max_blocks) is + * NOT sufficient. I'm not sure why this is the case, + * but let's be conservative and remove the extent + * status tree for the entire inode. There should be + * no outstanding delalloc extents thanks to the + * filemap_write_and_wait_range() call above. */ - ret = ext4_es_remove_extent(inode, lblk, max_blocks); - if (ret) - goto out_dio; - - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (ret) goto out_dio; } -- cgit v1.2.3 From bd30d702fc320085f178d22866b32fdc4736c991 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:33:09 -0400 Subject: ext4: fix accidental flag aliasing in ext4_map_blocks flags Commit b8a8684502a0f introduced an accidental flag aliasing between EXT4_EX_NOCACHE and EXT4_GET_BLOCKS_CONVERT_UNWRITTEN. Fortunately, this didn't introduce any untorward side effects --- we got lucky. Nevertheless, fix this and leave a warning to hopefully avoid this from happening in the future. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cf3ad75d3015..550b4f99a843 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -569,6 +569,7 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 +/* DO NOT ASSIGN ADDITIONAL FLAG VALUES WITHOUT ADJUSTING THE FLAGS BELOW */ /* * The bit position of these flags must not overlap with any of the @@ -579,8 +580,8 @@ enum { * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ -#define EXT4_EX_NOCACHE 0x0400 -#define EXT4_EX_FORCE_CACHE 0x0800 +#define EXT4_EX_NOCACHE 0x0800 +#define EXT4_EX_FORCE_CACHE 0x1000 /* * Flags used by ext4_free_blocks -- cgit v1.2.3 From 705912ca95f4bbdbb3be753e46bf30d6be15a5e8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:34:09 -0400 Subject: ext4: teach ext4_ext_find_extent() to free path on error Right now, there are a places where it is all to easy to leak memory on an error path, via a usage like this: struct ext4_ext_path *path = NULL while (...) { ... path = ext4_ext_find_extent(inode, block, path, 0); if (IS_ERR(path)) { /* oops, if path was non-NULL before the call to ext4_ext_find_extent, we've leaked it! :-( */ ... return PTR_ERR(path); } ... } Unfortunately, there some code paths where we are doing the following instead: path = ext4_ext_find_extent(inode, block, orig_path, 0); and where it's important that we _not_ free orig_path in the case where ext4_ext_find_extent() returns an error. So change the function signature of ext4_ext_find_extent() so that it takes a struct ext4_ext_path ** for its third argument, and by default, on an error, it will free the struct ext4_ext_path, and then zero out the struct ext4_ext_path * pointer. In order to avoid causing problems, we add a flag EXT4_EX_NOFREE_ON_ERR which causes ext4_ext_find_extent() to use the original behavior of forcing the caller to deal with freeing the original path pointer on the error case. The goal is to get rid of EXT4_EX_NOFREE_ON_ERR entirely, but this allows for a gentle transition and makes the patches easier to verify. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 ++- fs/ext4/extents.c | 28 ++++++++++++++++++---------- fs/ext4/move_extent.c | 2 +- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 550b4f99a843..696e51ae02fa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -582,6 +582,7 @@ enum { */ #define EXT4_EX_NOCACHE 0x0800 #define EXT4_EX_FORCE_CACHE 0x1000 +#define EXT4_EX_NOFREE_ON_ERR 0x2000 /* * Flags used by ext4_free_blocks @@ -2733,7 +2734,7 @@ extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *, + struct ext4_ext_path **, int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bf205f72be35..0ced78c974e2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -855,11 +855,13 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) struct ext4_ext_path * ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path, int flags) + struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - short int depth, i, ppos = 0, alloc = 0; + struct ext4_ext_path *path = orig_path ? *orig_path : NULL; + short int depth, i, ppos = 0; + short free_on_err = (flags & EXT4_EX_NOFREE_ON_ERR) == 0; int ret; eh = ext_inode_hdr(inode); @@ -871,7 +873,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, GFP_NOFS); if (unlikely(!path)) return ERR_PTR(-ENOMEM); - alloc = 1; + free_on_err = 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -923,8 +925,11 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, err: ext4_ext_drop_refs(path); - if (alloc) + if (free_on_err) { kfree(path); + if (orig_path) + *orig_path = NULL; + } return ERR_PTR(ret); } @@ -1356,7 +1361,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + &path, gb_flags | EXT4_EX_NOFREE_ON_ERR); if (IS_ERR(path)) err = PTR_ERR(path); } else { @@ -1369,7 +1374,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + &path, gb_flags | EXT4_EX_NOFREE_ON_ERR); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -2152,7 +2157,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, path = NULL; } - path = ext4_ext_find_extent(inode, block, path, 0); + path = ext4_ext_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -3313,7 +3318,8 @@ static int ext4_split_extent(handle_t *handle, * result in split of original leaf or extent zeroout. */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + path = ext4_ext_find_extent(inode, map->m_lblk, &path, + EXT4_EX_NOFREE_ON_ERR); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3697,7 +3703,8 @@ static int ext4_convert_initialized_extents(handle_t *handle, if (err < 0) goto out; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + path = ext4_ext_find_extent(inode, map->m_lblk, &path, + EXT4_EX_NOFREE_ON_ERR); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -3769,7 +3776,8 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (err < 0) goto out; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + path = ext4_ext_find_extent(inode, map->m_lblk, &path, + EXT4_EX_NOFREE_ON_ERR); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index c8f895b410f6..5e2465a8e4ce 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -37,7 +37,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); + path = ext4_ext_find_extent(inode, lblock, orig_path, EXT4_EX_NOCACHE); if (IS_ERR(path)) ret = PTR_ERR(path); else if (path[ext_depth(inode)].p_ext == NULL) -- cgit v1.2.3 From e8b83d9303317fb068ad83d87991b610fe990ed5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:35:09 -0400 Subject: ext4: collapse ext4_convert_initialized_extents() The function ext4_convert_initialized_extents() is only called by a single function --- ext4_ext_convert_initalized_extents(). Inline the code and get rid of the unnecessary bits in order to simplify the code. Rename ext4_ext_convert_initalized_extents() to convert_initalized_extents() since it's a static function that is actually only used in a single caller, ext4_ext_map_blocks(). Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 136 +++++++++++++++++++++++------------------------------- 1 file changed, 59 insertions(+), 77 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0ced78c974e2..5fc5e2b6e3a7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3677,67 +3677,6 @@ static int ext4_split_convert_extents(handle_t *handle, return ext4_split_extent(handle, inode, path, map, split_flag, flags); } -static int ext4_convert_initialized_extents(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path) -{ - struct ext4_extent *ex; - ext4_lblk_t ee_block; - unsigned int ee_len; - int depth; - int err = 0; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, - (unsigned long long)ee_block, ee_len); - - if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, &path, - EXT4_EX_NOFREE_ON_ERR); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - err = -EIO; - goto out; - } - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* first mark the extent as unwritten */ - ext4_ext_mark_unwritten(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: - ext4_ext_show_leaf(inode, path); - return err; -} - - static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, @@ -3974,12 +3913,15 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, - unsigned int allocated, ext4_fsblk_t newblock) +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path *path, int flags, + unsigned int allocated, ext4_fsblk_t newblock) { - int ret = 0; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; int err = 0; /* @@ -3989,20 +3931,60 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; - ret = ext4_convert_initialized_extents(handle, inode, map, - path); - if (ret >= 0) { - ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else - err = ret; + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, path, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + return err; + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, map->m_lblk, &path, + EXT4_EX_NOFREE_ON_ERR); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); + if (err) + return err; map->m_flags |= EXT4_MAP_UNWRITTEN; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; - - return err ? err : allocated; + return allocated; } static int @@ -4342,7 +4324,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - allocated = ext4_ext_convert_initialized_extent( + allocated = convert_initialized_extent( handle, inode, map, path, flags, allocated, newblock); goto out2; -- cgit v1.2.3 From 4f224b8b7be6856a3ceaf7f9d9c1860d467174ae Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:36:09 -0400 Subject: ext4: drop EXT4_EX_NOFREE_ON_ERR in convert_initialized_extent() Transfer responsibility of freeing struct ext4_ext_path on error to ext4_ext_find_extent(). Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5fc5e2b6e3a7..acb92ac47220 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3915,9 +3915,10 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, + struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { + struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3946,8 +3947,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, if (err < 0) return err; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, &path, - EXT4_EX_NOFREE_ON_ERR); + path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -4325,8 +4325,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( - handle, inode, map, path, flags, - allocated, newblock); + handle, inode, map, &path, + flags, allocated, newblock); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; -- cgit v1.2.3 From dfe5080939ea4686b3414b5d970a9b26733c57a4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:37:09 -0400 Subject: ext4: drop EXT4_EX_NOFREE_ON_ERR from rest of extents handling code Drop EXT4_EX_NOFREE_ON_ERR from ext4_ext_create_new_leaf(), ext4_split_extent(), ext4_convert_unwritten_extents_endio(). This requires fixing all of their callers to potentially ext4_ext_find_extent() to free the struct ext4_ext_path object in case of an error, and there are interlocking dependencies all the way up to ext4_ext_map_blocks(), ext4_swap_extents(), and ext4_ext_remove_space(). Once this is done, we can drop the EXT4_EX_NOFREE_ON_ERR flag since it is no longer necessary. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +- fs/ext4/extents.c | 112 +++++++++++++++++++++++++++--------------------------- fs/ext4/migrate.c | 2 +- 3 files changed, 59 insertions(+), 58 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 696e51ae02fa..4a5a6b95b2fa 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -582,7 +582,6 @@ enum { */ #define EXT4_EX_NOCACHE 0x0800 #define EXT4_EX_FORCE_CACHE 0x1000 -#define EXT4_EX_NOFREE_ON_ERR 0x2000 /* * Flags used by ext4_free_blocks @@ -2731,7 +2730,7 @@ extern int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path *, + struct ext4_ext_path **, struct ext4_extent *, int); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path **, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index acb92ac47220..ccdd2afc546e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -98,14 +98,14 @@ static void ext4_extent_block_csum_set(struct inode *inode, static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags); static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); @@ -293,12 +293,13 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) static inline int ext4_force_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, ext4_lblk_t lblk, + struct ext4_ext_path **ppath, ext4_lblk_t lblk, int nofail) { + struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); - return ext4_split_extent_at(handle, inode, path, lblk, unwritten ? + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); @@ -861,7 +862,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, struct buffer_head *bh; struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; - short free_on_err = (flags & EXT4_EX_NOFREE_ON_ERR) == 0; int ret; eh = ext_inode_hdr(inode); @@ -873,7 +873,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, GFP_NOFS); if (unlikely(!path)) return ERR_PTR(-ENOMEM); - free_on_err = 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -925,11 +924,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, err: ext4_ext_drop_refs(path); - if (free_on_err) { - kfree(path); - if (orig_path) - *orig_path = NULL; - } + kfree(path); + if (orig_path) + *orig_path = NULL; return ERR_PTR(ret); } @@ -1332,9 +1329,10 @@ out: static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext) { + struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1361,7 +1359,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - &path, gb_flags | EXT4_EX_NOFREE_ON_ERR); + ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { @@ -1374,7 +1372,7 @@ repeat: ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - &path, gb_flags | EXT4_EX_NOFREE_ON_ERR); + ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1914,9 +1912,10 @@ out: * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { + struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -2048,7 +2047,7 @@ prepend: if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) mb_flags = EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - path, newext); + ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2878,7 +2877,7 @@ again: * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_force_split_extent_at(handle, inode, path, + err = ext4_force_split_extent_at(handle, inode, &path, end + 1, 1); if (err < 0) goto out; @@ -3019,12 +3018,13 @@ again: } } out: - ext4_ext_drop_refs(path); - kfree(path); - if (err == -EAGAIN) { + if (path) { + ext4_ext_drop_refs(path); + kfree(path); path = NULL; - goto again; } + if (err == -EAGAIN) + goto again; ext4_journal_stop(handle); return err; @@ -3138,11 +3138,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3213,7 +3214,7 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3279,11 +3280,12 @@ fix_extent_len: */ static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; @@ -3306,7 +3308,7 @@ static int ext4_split_extent(handle_t *handle, EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; @@ -3318,8 +3320,7 @@ static int ext4_split_extent(handle_t *handle, * result in split of original leaf or extent zeroout. */ ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, &path, - EXT4_EX_NOFREE_ON_ERR); + path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3339,7 +3340,7 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; @@ -3373,9 +3374,10 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3599,7 +3601,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } - allocated = ext4_split_extent(handle, inode, path, + allocated = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3638,9 +3640,10 @@ out: static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; @@ -3674,14 +3677,15 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path **ppath) { + struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3710,17 +3714,14 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, path, + err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) - goto out; + return err; ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, &path, - EXT4_EX_NOFREE_ON_ERR); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } @@ -3942,7 +3943,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, path, + err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; @@ -3990,9 +3991,10 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, + struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { + struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); @@ -4014,8 +4016,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, - path, flags | EXT4_GET_BLOCKS_CONVERT); + ret = ext4_split_convert_extents(handle, inode, map, ppath, + flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -4033,7 +4035,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, - path); + ppath); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, @@ -4071,7 +4073,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -4332,7 +4334,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, goto out; ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, path, flags, + handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; @@ -4479,7 +4481,7 @@ got_allocated_blocks: err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (!err) - err = ext4_ext_insert_extent(handle, inode, path, + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (!err && set_unwritten) { @@ -5611,18 +5613,18 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (e1_blk < lblk1) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, - path1, lblk1, 0); + &path1, lblk1, 0); if (unlikely(*erp)) goto finish; } if (e2_blk < lblk2) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, - path2, lblk2, 0); + &path2, lblk2, 0); if (unlikely(*erp)) goto finish; } - /* ext4_split_extent_at() may retult in leaf extent split, + /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; @@ -5637,18 +5639,18 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, if (len != e1_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode1, - path1, lblk1 + len, 0); + &path1, lblk1 + len, 0); if (unlikely(*erp)) goto finish; } if (len != e2_len) { split = 1; *erp = ext4_force_split_extent_at(handle, inode2, - path2, lblk2 + len, 0); + &path2, lblk2 + len, 0); if (*erp) goto finish; } - /* ext4_split_extent_at() may retult in leaf extent split, + /* ext4_split_extent_at() may result in leaf extent split, * path must to be revalidated. */ if (split) goto repeat; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d3567f27bae7..aff7bdfdc461 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -81,7 +81,7 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); if (path) { -- cgit v1.2.3 From 523f431ccfffd3022e80e13befb9594f54b5607e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:38:09 -0400 Subject: ext4: call ext4_ext_drop_refs() from ext4_ext_find_extent() In nearly all of the calls to ext4_ext_find_extent() where the caller is trying to recycle the path object, ext4_ext_drop_refs() gets called to release the buffer heads before the path object gets overwritten. To simplify things for the callers, and to avoid the possibility of a memory leak, make ext4_ext_find_extent() responsible for dropping the buffers. Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ccdd2afc546e..4f4d52398712 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -867,8 +867,10 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - /* account possible depth increase */ - if (!path) { + if (path) + ext4_ext_drop_refs(path); + else { + /* account possible depth increase */ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); if (unlikely(!path)) @@ -1356,7 +1358,6 @@ repeat: goto out; /* refill path */ - ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); @@ -1369,7 +1370,6 @@ repeat: goto out; /* refill path */ - ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); @@ -3319,7 +3319,6 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); @@ -3718,7 +3717,6 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; - ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); @@ -3947,7 +3945,6 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; - ext4_ext_drop_refs(path); path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); -- cgit v1.2.3 From b7ea89ad0a6b855172158a999d3f5008403f4011 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:39:09 -0400 Subject: ext4: allow a NULL argument to ext4_ext_drop_refs() Teach ext4_ext_drop_refs() to accept a NULL argument, much like kfree(). This allows us to drop a lot of checks to make sure path is non-NULL before calling ext4_ext_drop_refs(). Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 48 ++++++++++++++++++------------------------------ fs/ext4/extents_status.c | 6 ++---- fs/ext4/migrate.c | 6 ++---- fs/ext4/move_extent.c | 20 +++++++------------- 4 files changed, 29 insertions(+), 51 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4f4d52398712..538f9a4d96ff 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -709,9 +709,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, void ext4_ext_drop_refs(struct ext4_ext_path *path) { - int depth = path->p_depth; - int i; + int depth, i; + if (!path) + return; + depth = path->p_depth; for (i = 0; i <= depth; i++, path++) if (path->p_bh) { brelse(path->p_bh); @@ -2125,10 +2127,8 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - if (npath) { - ext4_ext_drop_refs(npath); - kfree(npath); - } + ext4_ext_drop_refs(npath); + kfree(npath); return err; } @@ -2283,11 +2283,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, block = es.es_lblk + es.es_len; } - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - + ext4_ext_drop_refs(path); + kfree(path); return err; } @@ -3018,11 +3015,9 @@ again: } } out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; - } + ext4_ext_drop_refs(path); + kfree(path); + path = NULL; if (err == -EAGAIN) goto again; ext4_journal_stop(handle); @@ -4611,10 +4606,8 @@ out: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); @@ -5693,16 +5686,11 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, count -= len; repeat: - if (path1) { - ext4_ext_drop_refs(path1); - kfree(path1); - path1 = NULL; - } - if (path2) { - ext4_ext_drop_refs(path2); - kfree(path2); - path2 = NULL; - } + ext4_ext_drop_refs(path1); + kfree(path1); + ext4_ext_drop_refs(path2); + kfree(path2); + path1 = path2 = NULL; } return replaced_count; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0b7e28e7eaa4..8ffff966d594 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -499,10 +499,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index aff7bdfdc461..061c300703c7 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -84,10 +84,8 @@ static int finish_range(handle_t *handle, struct inode *inode, retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); lb->first_pblock = 0; return retval; } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5e2465a8e4ce..a34c0760276c 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -113,10 +113,8 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, } ret = 1; out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); return ret; } @@ -711,11 +709,9 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, o_start += cur_len; d_start += cur_len; repeat: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; - } + ext4_ext_drop_refs(path); + kfree(path); + path = NULL; } *moved_len = o_start - orig_blk; if (*moved_len > len) @@ -727,10 +723,8 @@ out: ext4_discard_preallocations(donor_inode); } - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); -- cgit v1.2.3 From 10809df84a4d868db61af621bae3658494165279 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:40:09 -0400 Subject: ext4: teach ext4_ext_find_extent() to realloc path if necessary This adds additional safety in case for some reason we end reusing a path structure which isn't big enough for current depth of the inode. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_extents.h | 1 + fs/ext4/extents.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a867f5ca9991..3c9381547094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh) struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; + __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 538f9a4d96ff..c94c7480053e 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -869,14 +869,20 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_inode_hdr(inode); depth = ext_depth(inode); - if (path) + if (path) { ext4_ext_drop_refs(path); - else { + if (depth > path[0].p_maxdepth) { + kfree(path); + *orig_path = path = NULL; + } + } + if (!path) { /* account possible depth increase */ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); if (unlikely(!path)) return ERR_PTR(-ENOMEM); + path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -1820,6 +1826,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); + path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + @@ -2150,12 +2157,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - if (path && ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; - } - path = ext4_ext_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); @@ -2173,7 +2174,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); flags = 0; exists = 0; @@ -2897,7 +2897,7 @@ again: ext4_journal_stop(handle); return -ENOMEM; } - path[0].p_depth = depth; + path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; -- cgit v1.2.3 From ee4bd0d963b75cbad9bfb59b547146671c7a655a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:41:09 -0400 Subject: ext4: reuse path object in ext4_ext_shift_extents() Now that the semantics of ext4_ext_find_extent() are much cleaner, it's safe and more efficient to reuse the path object across the multiple calls to ext4_ext_find_extent() in ext4_ext_shift_extents(). Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c94c7480053e..22828e44a70d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5306,26 +5306,21 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, depth = path->p_depth; extent = path[depth].p_ext; - if (!extent) { - ext4_ext_drop_refs(path); - kfree(path); - return ret; - } + if (!extent) + goto out; stop_block = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - ext4_ext_drop_refs(path); - kfree(path); /* Nothing to shift, if hole is at the end of file */ if (start >= stop_block) - return ret; + goto out; /* * Don't start shifting extents until we make sure the hole is big * enough to accomodate the shift. */ - path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + path = ext4_ext_find_extent(inode, start - 1, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5338,8 +5333,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ex_start = 0; ex_end = 0; } - ext4_ext_drop_refs(path); - kfree(path); if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) @@ -5347,7 +5340,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, /* Its safe to start updating extents */ while (start < stop_block) { - path = ext4_ext_find_extent(inode, start, NULL, 0); + path = ext4_ext_find_extent(inode, start, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5363,19 +5356,17 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, path[depth].p_ext++; } else { start = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); - kfree(path); continue; } } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, &start); - ext4_ext_drop_refs(path); - kfree(path); if (ret) break; } - +out: + ext4_ext_drop_refs(path); + kfree(path); return ret; } -- cgit v1.2.3 From 3bdf14b4d7a3a7416577e9f9f421dbf29b5b6747 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:42:09 -0400 Subject: ext4: reuse path object in ext4_move_extents() Reuse the path object in ext4_move_extents() so we don't unnecessarily free and reallocate it. Also clean up the get_ext_path() wrapper so that it has the same semantics of freeing the path object on error as ext4_ext_find_extent(). Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index a34c0760276c..7bf970dd61f5 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -32,20 +32,21 @@ */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **orig_path) + struct ext4_ext_path **ppath) { - int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, orig_path, EXT4_EX_NOCACHE); + path = ext4_ext_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) - ret = PTR_ERR(path); - else if (path[ext_depth(inode)].p_ext == NULL) - ret = -ENODATA; - else - *orig_path = path; - - return ret; + return PTR_ERR(path); + if (path[ext_depth(inode)].p_ext == NULL) { + ext4_ext_drop_refs(path); + kfree(path); + *ppath = NULL; + return -ENODATA; + } + *ppath = path; + return 0; } /** @@ -667,7 +668,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, } d_start += next_blk - o_start; o_start = next_blk; - goto repeat; + continue; /* Check hole after the start pos */ } else if (cur_blk > o_start) { /* Skip hole */ @@ -708,10 +709,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, break; o_start += cur_len; d_start += cur_len; - repeat: - ext4_ext_drop_refs(path); - kfree(path); - path = NULL; } *moved_len = o_start - orig_blk; if (*moved_len > len) -- cgit v1.2.3 From ed8a1a766af7371bfbe41857a3a11496b4165143 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 14:43:09 -0400 Subject: ext4: rename ext4_ext_find_extent() to ext4_find_extent() Make the function name less redundant. Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 8 ++++---- fs/ext4/extents.c | 38 +++++++++++++++++++------------------- fs/ext4/extents_status.c | 2 +- fs/ext4/migrate.c | 3 +-- fs/ext4/move_extent.c | 4 ++-- 5 files changed, 27 insertions(+), 28 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4a5a6b95b2fa..c07f43f8eb93 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -573,7 +573,7 @@ enum { /* * The bit position of these flags must not overlap with any of the - * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be @@ -2732,9 +2732,9 @@ extern int ext4_can_extents_be_merged(struct inode *inode, extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path **, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path **, - int flags); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 22828e44a70d..3ac1686efff8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -857,8 +857,8 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) } struct ext4_ext_path * -ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path **orig_path, int flags) +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; @@ -1366,7 +1366,7 @@ repeat: goto out; /* refill path */ - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) @@ -1378,7 +1378,7 @@ repeat: goto out; /* refill path */ - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), ppath, gb_flags); if (IS_ERR(path)) { @@ -1951,7 +1951,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because - * ext4_ext_find_extent() can return either extent on the + * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ @@ -2034,7 +2034,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -2157,7 +2157,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, block, &path, 0); + path = ext4_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2840,7 +2840,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -3314,7 +3314,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3712,7 +3712,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, EXT4_GET_BLOCKS_CONVERT); if (err < 0) return err; - path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3940,7 +3940,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); if (err < 0) return err; - path = ext4_ext_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -4266,7 +4266,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4278,7 +4278,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; - * this is why assert can't be put in ext4_ext_find_extent() + * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " @@ -4363,7 +4363,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * If we are doing bigalloc, check to see if the extent returned - * by ext4_ext_find_extent() implies a cluster we can use. + * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { @@ -5300,7 +5300,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ext4_lblk_t ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); @@ -5320,7 +5320,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * Don't start shifting extents until we make sure the hole is big * enough to accomodate the shift. */ - path = ext4_ext_find_extent(inode, start - 1, &path, 0); + path = ext4_find_extent(inode, start - 1, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5340,7 +5340,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, /* Its safe to start updating extents */ while (start < stop_block) { - path = ext4_ext_find_extent(inode, start, &path, 0); + path = ext4_find_extent(inode, start, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5537,7 +5537,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, int e1_len, e2_len, len; int split = 0; - path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); if (unlikely(IS_ERR(path1))) { *erp = PTR_ERR(path1); path1 = NULL; @@ -5545,7 +5545,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, count = 0; goto repeat; } - path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); if (unlikely(IS_ERR(path2))) { *erp = PTR_ERR(path2); path2 = NULL; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 8ffff966d594..bdd400c81533 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -426,7 +426,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 061c300703c7..a432634f2e6a 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode, ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convinience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); - + path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); path = NULL; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7bf970dd61f5..5d7806390102 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -27,7 +27,7 @@ * @lblock: logical block number to find an extent path * @path: pointer to an extent path pointer (for output) * - * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * ext4_find_extent wrapper. Return 0 on success, or a negative error value * on failure. */ static inline int @@ -36,7 +36,7 @@ get_ext_path(struct inode *inode, ext4_lblk_t lblock, { struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) return PTR_ERR(path); if (path[ext_depth(inode)].p_ext == NULL) { -- cgit v1.2.3 From be1158cc615fd723552f0d9912087423c7cadda5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 21:19:01 -0400 Subject: jbd2: fold __process_buffer() into jbd2_log_do_checkpoint() __process_buffer() is only called by jbd2_log_do_checkpoint(), and it had a very complex locking protocol where it would be called with the j_list_lock, and sometimes exit with the lock held (if the return code was 0), or release the lock. This was confusing both to humans and to smatch (which erronously complained that the lock was taken twice). Folding __process_buffer() to the caller allows us to simplify the control flow, making the resulting function easier to read and reason about, and dropping the compiled size of fs/jbd2/checkpoint.c by 150 bytes (over 4% of the text size). Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd2/checkpoint.c | 195 ++++++++++++++++++++++----------------------------- 1 file changed, 84 insertions(+), 111 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7f34f4716165..f1507e5b7c9a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -254,81 +254,6 @@ __flush_batch(journal_t *journal, int *batch_count) *batch_count = 0; } -/* - * Try to flush one buffer from the checkpoint list to disk. - * - * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. Return <0 if the buffer has failed to - * be written out. - * - * Called with j_list_lock held and drops it if 1 is returned - */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - int *batch_count, transaction_t *transaction) -{ - struct buffer_head *bh = jh2bh(jh); - int ret = 0; - - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - ret = 1; - } else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - transaction->t_chp_stats.cs_forced_to_close++; - spin_unlock(&journal->j_list_lock); - if (unlikely(journal->j_flags & JBD2_UNMOUNT)) - /* - * The journal thread is dead; so starting and - * waiting for a commit to finish will cause - * us to wait for a _very_ long time. - */ - printk(KERN_ERR "JBD2: %s: " - "Waiting for Godot: block %llu\n", - journal->j_devname, - (unsigned long long) bh->b_blocknr); - jbd2_log_start_commit(journal, tid); - jbd2_log_wait_commit(journal, tid); - ret = 1; - } else if (!buffer_dirty(bh)) { - ret = 1; - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - get_bh(bh); - BUFFER_TRACE(bh, "remove from checkpoint"); - __jbd2_journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - __brelse(bh); - } else { - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal lock. - * We cannot afford to let the transaction logic start - * messing around with this buffer before we write it to - * disk, as that would break recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - journal->j_chkpt_bhs[*batch_count] = bh; - __buffer_relink_io(jh); - transaction->t_chp_stats.cs_written++; - (*batch_count)++; - if (*batch_count == JBD2_NR_BATCH) { - spin_unlock(&journal->j_list_lock); - __flush_batch(journal, batch_count); - ret = 1; - } - } - return ret; -} - /* * Perform an actual checkpoint. We take the first transaction on the * list of transactions to be checkpointed and send all its buffers @@ -339,9 +264,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, */ int jbd2_log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; - int result; + struct journal_head *jh; + struct buffer_head *bh; + transaction_t *transaction; + tid_t this_tid; + int err, result, batch_count = 0; jbd_debug(1, "Start checkpoint\n"); @@ -374,46 +301,92 @@ restart: * done (maybe it's a new transaction, but it fell at the same * address). */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct journal_head *jh; - int retry = 0, err; - - while (!retry && transaction->t_checkpoint_list) { - jh = transaction->t_checkpoint_list; - retry = __process_buffer(journal, jh, &batch_count, - transaction); - if (retry < 0 && !result) - result = retry; - if (!retry && (need_resched() || - spin_needbreak(&journal->j_list_lock))) { - spin_unlock(&journal->j_list_lock); - retry = 1; - break; - } - } + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; - if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } - __flush_batch(journal, &batch_count); + /* checkpoint all of the transaction's buffers */ + while (transaction->t_checkpoint_list) { + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; } + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; - if (retry) { - spin_lock(&journal->j_list_lock); - goto restart; + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so + * starting and waiting for a commit + * to finish will cause us to wait for + * a _very_ long time. + */ + printk(KERN_ERR + "JBD2: %s: Waiting for Godot: block %llu\n", + journal->j_devname, (unsigned long long) bh->b_blocknr); + + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + if (!buffer_dirty(bh)) { + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + get_bh(bh); + BUFFER_TRACE(bh, "remove from checkpoint"); + __jbd2_journal_remove_checkpoint(jh); + spin_unlock(&journal->j_list_lock); + __brelse(bh); + goto retry; } /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal + * lock. We cannot afford to let the transaction + * logic start messing around with this buffer before + * we write it to disk, as that would break + * recoverability. */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + __buffer_relink_io(jh); + transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || + need_resched() || + spin_needbreak(&journal->j_list_lock)) + goto unlock_and_flush; } + + if (batch_count) { + unlock_and_flush: + spin_unlock(&journal->j_list_lock); + retry: + if (batch_count) + __flush_batch(journal, &batch_count); + spin_lock(&journal->j_list_lock); + goto restart; + } + + /* + * Now we issued all of the transaction's buffers, let's deal + * with the buffers that are out for I/O. + */ + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; out: spin_unlock(&journal->j_list_lock); if (result < 0) -- cgit v1.2.3 From 88fe1acb5bedfcba5f42fcdf165493ee587ba643 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 1 Sep 2014 21:26:09 -0400 Subject: jbd2: fold __wait_cp_io into jbd2_log_do_checkpoint() __wait_cp_io() is only called by jbd2_log_do_checkpoint(). Fold it in to make it a bit easier to understand. Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 87 +++++++++++++++++++--------------------------------- 1 file changed, 31 insertions(+), 56 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index f1507e5b7c9a..18c7a8d3da13 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -183,58 +183,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } } -/* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. - * - * Return 0 on success, and return <0 if some buffers have failed - * to be written out. - * - * Called with j_list_lock held. - */ -static int __wait_cp_io(journal_t *journal, transaction_t *transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - tid_t this_tid; - int released = 0; - int ret = 0; - - this_tid = transaction->t_tid; -restart: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return ret; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - - /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list - */ - released = __jbd2_journal_remove_checkpoint(jh); - __brelse(bh); - } - - return ret; -} - static void __flush_batch(journal_t *journal, int *batch_count) { @@ -268,7 +216,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) struct buffer_head *bh; transaction_t *transaction; tid_t this_tid; - int err, result, batch_count = 0; + int result, batch_count = 0, done = 0; jbd_debug(1, "Start checkpoint\n"); @@ -384,9 +332,36 @@ restart: * Now we issued all of the transaction's buffers, let's deal * with the buffers that are out for I/O. */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; +restart2: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + while (!done && transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + get_bh(bh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart2; + } + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + + /* + * Now in whatever state the buffer currently is, we + * know that it has been written out and so we can + * drop it from the list + */ + done = __jbd2_journal_remove_checkpoint(jh); + __brelse(bh); + } out: spin_unlock(&journal->j_list_lock); if (result < 0) -- cgit v1.2.3 From 45f1a9c3f63db3d4562c16062a51740801fbd88c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 1 Sep 2014 21:34:09 -0400 Subject: ext4: enable block_validity by default Enable by default the block_validity feature, which checks for collisions between newly allocated blocks and critical system metadata. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 896e452b739d..7194a51eb217 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3519,8 +3519,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); - if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sb, BLOCK_VALIDITY); + /* block_validity enabled by default; disable with noblock_validity */ + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); -- cgit v1.2.3 From d91bd2c1d78d8d22f9f721aae84650a08239b509 Mon Sep 17 00:00:00 2001 From: Seunghun Lee Date: Mon, 1 Sep 2014 22:15:30 -0400 Subject: ext4: fix comments about get_blocks get_blocks is renamed to get_block. Signed-off-by: Seunghun Lee Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cc95dca5cb8a..4a16b0cc02de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -590,7 +590,7 @@ found: /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take - * the write lock of i_data_sem, and call get_blocks() + * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); @@ -1529,7 +1529,7 @@ out_unlock: } /* - * This is a special get_blocks_t callback which is used by + * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * -- cgit v1.2.3 From e963bb1de415ab06693357336c1bec664753e1e2 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 1 Sep 2014 22:22:13 -0400 Subject: ext4: improve extents status tree trace point This commit improves the trace point of extents status tree. We rename trace_ext4_es_shrink_enter in ext4_es_count() because it is also used in ext4_es_scan() and we can not identify them from the result. Further this commit fixes a variable name in trace point in order to keep consistency with others. Cc: Andreas Dilger Cc: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: Theodore Ts'o --- fs/ext4/extents_status.c | 6 +++--- include/trace/events/ext4.h | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index bdd400c81533..95da65c60d83 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1019,7 +1019,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1032,14 +1032,14 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int ret, nr_shrunk; ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index d4f70a7fe876..849aaba75dc8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2369,7 +2369,7 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, show_extent_status(__entry->found ? __entry->status : 0)) ); -TRACE_EVENT(ext4_es_shrink_enter, +DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), @@ -2391,26 +2391,38 @@ TRACE_EVENT(ext4_es_shrink_enter, __entry->nr_to_scan, __entry->cache_cnt) ); -TRACE_EVENT(ext4_es_shrink_exit, - TP_PROTO(struct super_block *sb, int shrunk_nr, int cache_cnt), +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), + + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, + TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), + + TP_ARGS(sb, nr_to_scan, cache_cnt) +); + +TRACE_EVENT(ext4_es_shrink_scan_exit, + TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), - TP_ARGS(sb, shrunk_nr, cache_cnt), + TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) - __field( int, shrunk_nr ) + __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; - __entry->shrunk_nr = shrunk_nr; + __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), - TP_printk("dev %d,%d shrunk_nr %d cache_cnt %d", + TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->shrunk_nr, __entry->cache_cnt) + __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, -- cgit v1.2.3 From eb68d0e2fc5a4e5c06324ea5f485fccbae626d05 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 1 Sep 2014 22:26:49 -0400 Subject: ext4: track extent status tree shrinker delay statictics This commit adds some statictics in extent status tree shrinker. The purpose to add these is that we want to collect more details when we encounter a stall caused by extent status tree shrinker. Here we count the following statictics: stats: the number of all objects on all extent status trees the number of reclaimable objects on lru list cache hits/misses the last sorted interval the number of inodes on lru list average: scan time for shrinking some objects the number of shrunk objects maximum: the inode that has max nr. of objects on lru list the maximum scan time for shrinking some objects The output looks like below: $ cat /proc/fs/ext4/sda1/es_shrinker_info stats: 28228 objects 6341 reclaimable objects 5281/631 cache hits/misses 586 ms last sorted interval 250 inodes on lru list average: 153 us scan time 128 shrunk objects maximum: 255 inode (255 objects, 198 reclaimable) 125723 us max scan time If the lru list has never been sorted, the following line will not be printed: 586ms last sorted interval If there is an empty lru list, the following lines also will not be printed: 250 inodes on lru list ... maximum: 255 inode (255 objects, 198 reclaimable) 0 us max scan time Meanwhile in this commit a new trace point is defined to print some details in __ext4_es_shrink(). Cc: Andreas Dilger Cc: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Zheng Liu Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 4 +- fs/ext4/extents_status.c | 186 +++++++++++++++++++++++++++++++++++++++++--- fs/ext4/extents_status.h | 13 +++- fs/ext4/super.c | 11 +-- include/trace/events/ext4.h | 31 ++++++++ 5 files changed, 224 insertions(+), 21 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c07f43f8eb93..00fd822ac6e4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -891,6 +891,7 @@ struct ext4_inode_info { struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_lru; + unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_lru_nr; /* protected by i_es_lock */ unsigned long i_touch_when; /* jiffies of last accessing */ @@ -1331,8 +1332,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; - unsigned long s_es_last_sorted; - struct percpu_counter s_extent_cache_cnt; + struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 95da65c60d83..09fd57667262 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -11,6 +11,8 @@ */ #include #include +#include +#include #include "ext4.h" #include "extents_status.h" @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, */ if (!ext4_es_is_delayed(es)) { EXT4_I(inode)->i_es_lru_nr++; - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + return es; } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + /* Decrease the lru counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); EXT4_I(inode)->i_es_lru_nr--; - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -729,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es) { struct ext4_es_tree *tree; + struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; @@ -765,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, } out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -931,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; struct list_head *cur, *tmp; LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); spin_lock(&sbi->s_es_lru_lock); retry: @@ -946,7 +966,8 @@ retry: * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. */ - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + if (percpu_counter_read_positive( + &es_stats->es_stats_lru_cnt) == 0) break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); @@ -956,7 +977,7 @@ retry: * time. Normally we try hard to avoid shrinking * precached inodes, but we will as a last resort. */ - if ((sbi->s_es_last_sorted < ei->i_touch_when) || + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || (skip_precached && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED))) { nr_skipped++; @@ -990,7 +1011,7 @@ retry: if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; + es_stats->es_stats_last_sorted = jiffies; ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); /* @@ -1008,6 +1029,22 @@ retry: if (locked_ei && nr_shrunk == 0) nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + nr_skipped, retried); return nr_shrunk; } @@ -1018,7 +1055,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1031,7 +1068,7 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) @@ -1043,19 +1080,148 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, return nr_shrunk; } -void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void * +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = seq->private; + struct ext4_es_stats *es_stats = &sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lru_lock); + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lru_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); + if (es_stats->es_stats_last_sorted != 0) + seq_printf(seq, " %u ms last sorted interval\n", + jiffies_to_msecs(jiffies - + es_stats->es_stats_last_sorted)); + if (inode_cnt) + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_es_seq_shrinker_info_ops = { + .start = ext4_es_seq_shrinker_info_start, + .next = ext4_es_seq_shrinker_info_next, + .stop = ext4_es_seq_shrinker_info_stop, + .show = ext4_es_seq_shrinker_info_show, +}; + +static int +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE_DATA(inode); + } + + return ret; +} + +static int +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static const struct file_operations ext4_es_seq_shrinker_info_fops = { + .owner = THIS_MODULE, + .open = ext4_es_seq_shrinker_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ext4_es_seq_shrinker_info_release, +}; + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { + int err; + INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_last_sorted = 0; + sbi->s_es_stats.es_stats_last_sorted = 0; + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0); + if (err) + goto err1; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker); + if (err) + goto err2; + + if (sbi->s_proc) + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, + &ext4_es_seq_shrinker_info_fops, sbi); + + return 0; + +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); unregister_shrinker(&sbi->s_es_shrinker); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f1b62a419920..efd5f970b501 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -64,6 +64,17 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +struct ext4_es_stats { + unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_lru_cnt; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, (pb & ~ES_MASK)); } -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7194a51eb217..487c65b8cff0 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -820,7 +820,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -885,6 +884,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_all_nr = 0; ei->i_es_lru_nr = 0; ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; @@ -3890,12 +3890,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sbi); - - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + if (ext4_es_register_shrinker(sbi)) goto failed_mount3; - } sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; @@ -4225,10 +4221,9 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } -failed_mount3: ext4_es_unregister_shrinker(sbi); +failed_mount3: del_timer_sync(&sbi->s_err_report); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 849aaba75dc8..ff4bd1b35246 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2450,6 +2450,37 @@ TRACE_EVENT(ext4_collapse_range, __entry->offset, __entry->len) ); +TRACE_EVENT(ext4_es_shrink, + TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, + int skip_precached, int nr_skipped, int retried), + + TP_ARGS(sb, nr_shrunk, scan_time, skip_precached, nr_skipped, retried), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_shrunk ) + __field( unsigned long long, scan_time ) + __field( int, skip_precached ) + __field( int, nr_skipped ) + __field( int, retried ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nr_shrunk = nr_shrunk; + __entry->scan_time = div_u64(scan_time, 1000); + __entry->skip_precached = skip_precached; + __entry->nr_skipped = nr_skipped; + __entry->retried = retried; + ), + + TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu skip_precached %d " + "nr_skipped %d retried %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, + __entry->scan_time, __entry->skip_precached, + __entry->nr_skipped, __entry->retried) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From a521100231f816f8cdd9c8e77da14ff1e42c2b17 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Sep 2014 18:06:25 -0400 Subject: ext4: pass allocation_request struct to ext4_(alloc,splice)_branch Instead of initializing the allocation_request structure in ext4_alloc_branch(), set it up in ext4_ind_map_blocks(), and then pass it to ext4_alloc_branch() and ext4_splice_branch(). This allows ext4_ind_map_blocks to pass flags in the allocation request structure without having to add Yet Another argument to ext4_alloc_branch(). Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/indirect.c | 82 +++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 44 deletions(-) diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index e75f840000a0..69af0cd64724 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -318,34 +318,22 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) +static int ext4_alloc_branch(handle_t *handle, + struct ext4_allocation_request *ar, + int indirect_blks, ext4_lblk_t *offsets, + Indirect *branch) { - struct ext4_allocation_request ar; struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; - /* - * Set up for the direct block allocation - */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.len = *blks; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - ar.flags = EXT4_MB_HINT_DATA; - for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { - ar.goal = goal; - new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else - goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, - goal, 0, NULL, &err); + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, + ar->inode, ar->goal, 0, NULL, &err); if (err) { i--; goto failed; @@ -354,7 +342,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (i == 0) continue; - bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; @@ -372,7 +360,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, b = new_blocks[i]; if (i == indirect_blks) - len = ar.len; + len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); @@ -381,11 +369,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } - *blks = ar.len; return 0; failed: for (; i >= 0; i--) { @@ -396,10 +383,10 @@ failed: * existing before ext4_alloc_branch() was called. */ if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, inode, branch[i].bh, + ext4_forget(handle, 1, ar->inode, branch[i].bh, branch[i].bh->b_blocknr); - ext4_free_blocks(handle, inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar.len : 1, 0); + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar->len : 1, 0); } return err; } @@ -419,9 +406,9 @@ failed: * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) +static int ext4_splice_branch(handle_t *handle, + struct ext4_allocation_request *ar, + Indirect *where, int num) { int i; int err = 0; @@ -446,9 +433,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ - if (num == 0 && blks > 1) { + if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) + for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } @@ -465,14 +452,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, */ jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, ar->inode); jbd_debug(5, "splicing direct\n"); } return err; @@ -484,11 +471,11 @@ err_out: * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), + ar->len, 0); return err; } @@ -525,11 +512,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; - ext4_fsblk_t goal; int indirect_blks; int blocks_to_boundary = 0; int depth; @@ -579,7 +566,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, return -ENOSPC; } - goal = ext4_find_goal(inode, map->m_lblk, partial); + /* Set up for the direct block allocation */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; @@ -588,13 +582,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); + ar.len = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, + err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* @@ -605,14 +599,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); + count = ar.len; got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); -- cgit v1.2.3 From e3cf5d5d9a86df1c5e413bdd3725c25a16ff854c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Sep 2014 18:07:25 -0400 Subject: ext4: prepare to drop EXT4_STATE_DELALLOC_RESERVED The EXT4_STATE_DELALLOC_RESERVED flag was originally implemented because it was too hard to make sure the mballoc and get_block flags could be reliably passed down through all of the codepaths that end up calling ext4_mb_new_blocks(). Since then, we have mb_flags passed down through most of the code paths, so getting rid of EXT4_STATE_DELALLOC_RESERVED isn't as tricky as it used to. This commit plumbs in the last of what is required, and then adds a WARN_ON check to make sure we haven't missed anything. If this passes a full regression test run, we can then drop EXT4_STATE_DELALLOC_RESERVED. Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/balloc.c | 3 +-- fs/ext4/extents.c | 6 +++++- fs/ext4/indirect.c | 6 +++++- fs/ext4/mballoc.c | 10 ++++++---- fs/ext4/xattr.c | 6 ------ 5 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 581ef40fbe90..d70f154f6da3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -636,8 +636,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ - if (!(*errp) && - ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ac1686efff8..8170b3254767 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1933,6 +1933,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; int mb_flags = 0, unwritten; + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EIO; @@ -2054,7 +2056,7 @@ prepend: * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) - mb_flags = EXT4_MB_USE_RESERVED; + mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, ppath, newext); if (err) @@ -4438,6 +4440,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 69af0cd64724..36b369697a13 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -333,7 +333,9 @@ static int ext4_alloc_branch(handle_t *handle, new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, - ar->inode, ar->goal, 0, NULL, &err); + ar->inode, ar->goal, + ar->flags & EXT4_MB_DELALLOC_RESERVED, + NULL, &err); if (err) { i--; goto failed; @@ -572,6 +574,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ar.logical = map->m_lblk; if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b0f9ef517d6..15dffdac5907 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4415,9 +4415,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, * EDQUOT check, as blocks and quotas have been already * reserved when data being copied into pagecache. */ - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) + if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) { + WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0); ar->flags |= EXT4_MB_DELALLOC_RESERVED; - else { + } + + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. @@ -4528,8 +4531,7 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { - if (!ext4_test_inode_state(ar->inode, - EXT4_STATE_DELALLOC_RESERVED)) + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e7387337060c..da4df703c211 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -899,14 +899,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - /* - * take i_data_sem because we will test - * i_delalloc_reserved_flag in ext4_mb_new_blocks - */ - down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); - up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; -- cgit v1.2.3 From 754cfed6bbcfdea6afb14f2686f7f8d71e94d4e2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Sep 2014 18:08:22 -0400 Subject: ext4: drop the EXT4_STATE_DELALLOC_RESERVED flag Having done a full regression test, we can now drop the DELALLOC_RESERVED state flag. Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 1 - fs/ext4/inode.c | 20 ++++---------------- fs/ext4/mballoc.c | 10 ---------- 3 files changed, 4 insertions(+), 27 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 00fd822ac6e4..4855800fcc5d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1400,7 +1400,6 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4a16b0cc02de..d5dd7d46844e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -595,14 +595,6 @@ found: */ down_write(&EXT4_I(inode)->i_data_sem); - /* - * if the caller is from delayed allocation writeout path - * we have already reserved fs blocks for allocation - * let the underlying get_block() function know to - * avoid double accounting - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between @@ -631,8 +623,6 @@ found: (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { unsigned int status; @@ -2004,12 +1994,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * in data loss. So use reserved blocks to allocate metadata if * possible. * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks - * in question are delalloc blocks. This affects functions in many - * different parts of the allocation call path. This flag exists - * primarily because we don't want to change *many* call functions, so - * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag - * once the inode's allocation semaphore is taken. + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if + * the blocks in question are delalloc blocks. This indicates + * that the blocks and quotas has already been checked when + * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 15dffdac5907..65cca2881d71 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4410,16 +4410,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (IS_NOQUOTA(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; - /* - * For delayed allocation, we could skip the ENOSPC and - * EDQUOT check, as blocks and quotas have been already - * reserved when data being copied into pagecache. - */ - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) { - WARN_ON((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0); - ar->flags |= EXT4_MB_DELALLOC_RESERVED; - } - if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation -- cgit v1.2.3 From dc6e8d669cf5cb3ff84707c372c0a2a8a5e80845 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Sep 2014 18:09:22 -0400 Subject: jbd2: don't call get_bh() before calling __jbd2_journal_remove_checkpoint() The __jbd2_journal_remove_checkpoint() doesn't require an elevated b_count; indeed, until the jh structure gets released by the call to jbd2_journal_put_journal_head(), the bh's b_count is elevated by virtue of the existence of the jh structure. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 18c7a8d3da13..90d6091d7e18 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - /* - * Get our reference so that bh cannot be freed before - * we unlock it - */ - get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - BUFFER_TRACE(bh, "release"); - __brelse(bh); } return ret; } @@ -216,7 +209,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) struct buffer_head *bh; transaction_t *transaction; tid_t this_tid; - int result, batch_count = 0, done = 0; + int result, batch_count = 0; jbd_debug(1, "Start checkpoint\n"); @@ -291,11 +284,9 @@ restart: if (!buffer_dirty(bh)) { if (unlikely(buffer_write_io_error(bh)) && !result) result = -EIO; - get_bh(bh); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); spin_unlock(&journal->j_list_lock); - __brelse(bh); goto retry; } /* @@ -338,12 +329,12 @@ restart2: transaction->t_tid != this_tid) goto out; - while (!done && transaction->t_checkpoint_io_list) { + while (transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); - get_bh(bh); if (buffer_locked(bh)) { spin_unlock(&journal->j_list_lock); + get_bh(bh); wait_on_buffer(bh); /* the journal_head may have gone by now */ BUFFER_TRACE(bh, "brelse"); @@ -359,8 +350,8 @@ restart2: * know that it has been written out and so we can * drop it from the list */ - done = __jbd2_journal_remove_checkpoint(jh); - __brelse(bh); + if (__jbd2_journal_remove_checkpoint(jh)) + break; } out: spin_unlock(&journal->j_list_lock); -- cgit v1.2.3 From 0e5ecf0a762627b949141df1d83094a9b0eb54a8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 18:09:29 -0400 Subject: jbd2: optimize jbd2_log_do_checkpoint() a bit When we discover written out buffer in transaction checkpoint list we don't have to recheck validity of a transaction. Either this is the last buffer in a transaction - and then we are done - or this isn't and then we can just take another buffer from the checkpoint list without dropping j_list_lock. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 90d6091d7e18..9ffb19cf376b 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -285,9 +285,10 @@ restart: if (unlikely(buffer_write_io_error(bh)) && !result) result = -EIO; BUFFER_TRACE(bh, "remove from checkpoint"); - __jbd2_journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - goto retry; + if (__jbd2_journal_remove_checkpoint(jh)) + /* The transaction was released; we're done */ + goto out; + continue; } /* * Important: we are about to write the buffer, and -- cgit v1.2.3 From d26e2c4d72c2f2a38246f618480864fe3224929c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 4 Sep 2014 18:09:29 -0400 Subject: ext4: renumber EXT4_EX_* flags to avoid flag aliasing problems Suggested-by: Andreas Dilger Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4855800fcc5d..f70c3fc94296 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -569,7 +569,6 @@ enum { #define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 -/* DO NOT ASSIGN ADDITIONAL FLAG VALUES WITHOUT ADJUSTING THE FLAGS BELOW */ /* * The bit position of these flags must not overlap with any of the @@ -580,8 +579,8 @@ enum { * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ -#define EXT4_EX_NOCACHE 0x0800 -#define EXT4_EX_FORCE_CACHE 0x1000 +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 /* * Flags used by ext4_free_blocks -- cgit v1.2.3 From 3b5e6454aaf6b4439b19400d8365e2ec2d24e411 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:04:42 -0400 Subject: fs/buffer.c: support buffer cache allocations with gfp modifiers A buffer cache is allocated from movable area because it is referred for a while and released soon. But some filesystems are taking buffer cache for a long time and it can disturb page migration. New APIs are introduced to allocate buffer cache with user specific flag. *_gfp APIs are for user want to set page allocation flag for page cache allocation. And *_unmovable APIs are for the user wants to allocate page cache from non-movable area. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/buffer.c | 45 +++++++++++++++++++++++++------------------ include/linux/buffer_head.h | 47 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 68 insertions(+), 24 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..9a6029e0dd71 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1058,7 +1058,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1085,11 +1085,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1111,13 +1112,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1371,24 +1373,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1404,24 +1407,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 324329ceea1e..73b45225a7ca 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -175,12 +175,13 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); -struct buffer_head *__getblk(struct block_device *bdev, sector_t block, - unsigned size); +struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -struct buffer_head *__bread(struct block_device *, sector_t block, unsigned size); +struct buffer_head *__bread_gfp(struct block_device *, + sector_t block, unsigned size, gfp_t gfp); void invalidate_bh_lrus(void); struct buffer_head *alloc_buffer_head(gfp_t gfp_flags); void free_buffer_head(struct buffer_head * bh); @@ -295,7 +296,13 @@ static inline void bforget(struct buffer_head *bh) static inline struct buffer_head * sb_bread(struct super_block *sb, sector_t block) { - return __bread(sb->s_bdev, block, sb->s_blocksize); + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); +} + +static inline struct buffer_head * +sb_bread_unmovable(struct super_block *sb, sector_t block) +{ + return __bread_gfp(sb->s_bdev, block, sb->s_blocksize, 0); } static inline void @@ -307,7 +314,7 @@ sb_breadahead(struct super_block *sb, sector_t block) static inline struct buffer_head * sb_getblk(struct super_block *sb, sector_t block) { - return __getblk(sb->s_bdev, block, sb->s_blocksize); + return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, __GFP_MOVABLE); } static inline struct buffer_head * @@ -344,6 +351,36 @@ static inline void lock_buffer(struct buffer_head *bh) __lock_buffer(bh); } +static inline struct buffer_head *getblk_unmovable(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, 0); +} + +static inline struct buffer_head *__getblk(struct block_device *bdev, + sector_t block, + unsigned size) +{ + return __getblk_gfp(bdev, block, size, __GFP_MOVABLE); +} + +/** + * __bread() - reads a specified block and returns the bh + * @bdev: the block_device to read from + * @block: number of block + * @size: size (in bytes) to read + * + * Reads a specified block, and returns buffer head that contains it. + * The page cache is allocated from movable area so that it can be migrated. + * It returns NULL if the block was unreadable. + */ +static inline struct buffer_head * +__bread(struct block_device *bdev, sector_t block, unsigned size) +{ + return __bread_gfp(bdev, block, size, __GFP_MOVABLE); +} + extern int __set_page_dirty_buffers(struct page *page); #else /* CONFIG_BLOCK */ -- cgit v1.2.3 From a8ac900b8163703340a2fdad11c32f96b8fe686d Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:36:15 -0400 Subject: ext4: use non-movable memory for the ext4 superblock Since the ext4 superblock is not released until the file system is unmounted, allocate the buffer cache entry for the ext4 superblock out of the non-moveable are to allow page migrations and thus CMA allocations to more easily succeed if the CMA area is limited. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 487c65b8cff0..4b81747b3a80 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3436,7 +3436,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) logical_sb_block = sb_block; } - if (!(bh = sb_bread(sb, logical_sb_block))) { + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } @@ -3646,7 +3646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); - bh = sb_bread(sb, logical_sb_block); + bh = sb_bread_unmovable(sb, logical_sb_block); if (!bh) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); @@ -3868,7 +3868,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); if (!sbi->s_group_desc[i]) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); -- cgit v1.2.3 From a49058fab2912296f068759490ac69ba43b43861 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Thu, 4 Sep 2014 22:36:35 -0400 Subject: jbd/jbd2: use non-movable memory for the jbd superblock Sicne the jbd/jbd2 superblock is not released until the file system is unmounted, allocate the buffer cache from the non-moveable area to allow page migration and CMA allocations to more easily succeed. Signed-off-by: Gioh Kim Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/jbd/journal.c | 2 +- fs/jbd2/journal.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 06fe11e0abfa..aab8549591e7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 19d74d86d99c..415041c4c40c 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", -- cgit v1.2.3 From a2d4a646e619541e803fb52636964df39aed94b7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 11 Sep 2014 11:15:15 -0400 Subject: ext4: don't use MAXQUOTAS value MAXQUOTAS value defines maximum number of quota types VFS supports. This isn't necessarily the number of types ext4 supports. Although ext4 will support project quotas, use ext4 private definition for consistency with other filesystems. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 5 ++++- fs/ext4/ext4_jbd2.h | 6 +++--- fs/ext4/super.c | 22 +++++++++++----------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f70c3fc94296..1eb5b7b912a8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1175,6 +1175,9 @@ struct ext4_super_block { #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 2 + /* * fourth extended-fs super-block data in memory */ @@ -1238,7 +1241,7 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 17c00ff202f2..9c5b49fb281e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -102,9 +102,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif -#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) static inline int ext4_jbd2_credits_xattr(struct inode *inode) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4b81747b3a80..a318a2dfc79e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -822,7 +822,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirtyclusters_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif @@ -2207,7 +2207,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) @@ -2263,7 +2263,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } @@ -4238,7 +4238,7 @@ failed_mount: remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif ext4_blkdev_remove(sbi); @@ -4765,7 +4765,7 @@ struct ext4_mount_options { u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; @@ -4795,7 +4795,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], GFP_KERNEL); @@ -4956,7 +4956,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) @@ -4985,7 +4985,7 @@ restore_opts: sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } @@ -5188,7 +5188,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, { int err; struct inode *qf_inode; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; @@ -5216,13 +5216,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, static int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED); -- cgit v1.2.3 From 52c198c6820f68b6fbe1d83f76e34a82bf736024 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Sep 2014 11:18:13 -0400 Subject: ext4: add sysfs entry showing whether the fs contains errors Currently there is no easy way to tell that the mounted file system contains errors other than checking for log messages, or reading the information directly from superblock. This patch adds new sysfs entries: errors_count (number of fs errors we encounter) first_error_time (unix timestamp for the first error we see) last_error_time (unix timestamp for the last error we see) If the file system is not marked as containing errors then any of the file will return 0. Otherwise it will contain valid information. More details about the errors should as always be found in the logs. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a318a2dfc79e..ff889e1f11d5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2548,6 +2548,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t es_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + + unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + static ssize_t reserved_clusters_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { @@ -2601,14 +2611,29 @@ static struct ext4_attr ext4_attr_##_name = { \ .offset = offsetof(struct ext4_sb_info, _elname),\ }, \ } + +#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_super_block, _elname), \ + }, \ +} + #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) + +#define EXT4_RO_ATTR_ES_UI(name, elname) \ + EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) + #define ATTR_LIST(name) &ext4_attr_##name.attr #define EXT4_DEPRECATED_ATTR(_name, _val) \ static struct ext4_attr ext4_attr_##_name = { \ @@ -2641,6 +2666,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2664,6 +2692,9 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), NULL, }; -- cgit v1.2.3 From c7f725435adcf2ade4b9152ee33339d28f4cc330 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 11 Sep 2014 11:27:58 -0400 Subject: ext4: provide separate operations for sysfs feature files Currently sysfs feature files uses ext4_attr_ops as the file operations to show/store data. However the feature files is not supposed to contain any data at all, the sole existence of the file means that the module support the feature. Moreover, none of the sysfs feature attributes actually register show/store functions so that would not be a problem. However if a sysfs feature attribute register a show or store function we might be in trouble because the kobject in this case is _not_ embedded in the ext4_sb_info structure as ext4_attr_show/store expect. So just to be safe, provide separate empty sysfs_ops to use in ext4_feat_ktype. This might safe us from potential problems in the future. As a bonus we can "store" something more descriptive than nothing in the files, so let it contain "enabled" to make it clear that the feature is really present in the module. Signed-off-by: Lukas Czerner Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ff889e1f11d5..2766b8eba121 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2754,9 +2754,25 @@ static void ext4_feat_release(struct kobject *kobj) complete(&ext4_feat->f_kobj_unregister); } +static ssize_t ext4_feat_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "supported\n"); +} + +/* + * We can not use ext4_attr_show/store because it relies on the kobject + * being embedded in the ext4_sb_info structure which is definitely not + * true in this case. + */ +static const struct sysfs_ops ext4_feat_ops = { + .show = ext4_feat_show, + .store = NULL, +}; + static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_attr_ops, + .sysfs_ops = &ext4_feat_ops, .release = ext4_feat_release, }; -- cgit v1.2.3 From 3230bbfce8a9270acc77fafd0d9ff90e94f28993 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Mar 2014 00:34:05 +0800 Subject: ACPI: introduce ACPI int340x thermal scan handler Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core CPU/SOC, for thermal safety reasons. They are exposed for the OS to use via 1) INT3400 ACPI device object as the master. 2) INT3401 ~ INT340B ACPI device objects as the slaves. This patch introduces a scan handler to enumerate the INT3400 ACPI device object to platform bus, and prevent its slaves from being enumerated before the controller driver being probed. Signed-off-by: Zhang Rui --- drivers/acpi/Makefile | 1 + drivers/acpi/int340x_thermal.c | 51 ++++++++++++++++++++++++++++++++++++++++++ drivers/acpi/internal.h | 1 + drivers/acpi/scan.c | 1 + drivers/thermal/Kconfig | 17 ++++++++++++++ 5 files changed, 71 insertions(+) create mode 100644 drivers/acpi/int340x_thermal.c diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 505d4d79fe3e..c3b2fcb729f3 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -43,6 +43,7 @@ acpi-y += pci_root.o pci_link.o pci_irq.o acpi-y += acpi_lpss.o acpi-y += acpi_platform.o acpi-y += acpi_pnp.o +acpi-y += int340x_thermal.o acpi-y += power.o acpi-y += event.o acpi-y += sysfs.o diff --git a/drivers/acpi/int340x_thermal.c b/drivers/acpi/int340x_thermal.c new file mode 100644 index 000000000000..2103bb6d9016 --- /dev/null +++ b/drivers/acpi/int340x_thermal.c @@ -0,0 +1,51 @@ +/* + * ACPI support for int340x thermal drivers + * + * Copyright (C) 2014, Intel Corporation + * Authors: Zhang Rui + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include "internal.h" + +#define DO_ENUMERATION 0x01 +static const struct acpi_device_id int340x_thermal_device_ids[] = { + {"INT3400", DO_ENUMERATION }, + {"INT3401"}, + {"INT3402"}, + {"INT3403"}, + {"INT3404"}, + {"INT3406"}, + {"INT3407"}, + {"INT3408"}, + {"INT3409"}, + {"INT340A"}, + {"INT340B"}, + {""}, +}; + +static int int340x_thermal_handler_attach(struct acpi_device *adev, + const struct acpi_device_id *id) +{ +#ifdef CONFIG_INT340X_THERMAL + if (id->driver_data == DO_ENUMERATION) + acpi_create_platform_device(adev); +#endif + return 1; +} + +static struct acpi_scan_handler int340x_thermal_handler = { + .ids = int340x_thermal_device_ids, + .attach = int340x_thermal_handler_attach, +}; + +void __init acpi_int340x_thermal_init(void) +{ + acpi_scan_add_handler(&int340x_thermal_handler); +} diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 4c5cf77e7576..de47f9f746c9 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -31,6 +31,7 @@ void acpi_pci_link_init(void); void acpi_processor_init(void); void acpi_platform_init(void); void acpi_pnp_init(void); +void acpi_int340x_thermal_init(void); int acpi_sysfs_init(void); void acpi_container_init(void); void acpi_memory_hotplug_init(void); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 0a817ad24f16..eed9740651f8 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -2306,6 +2306,7 @@ int __init acpi_scan_init(void) acpi_container_init(); acpi_memory_hotplug_init(); acpi_pnp_init(); + acpi_int340x_thermal_init(); mutex_lock(&acpi_scan_lock); /* diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 693208eb9047..2ff7416ca930 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -234,6 +234,23 @@ config INTEL_SOC_DTS_THERMAL notification methods.The other trip is a critical trip point, which was set by the driver based on the TJ MAX temperature. +config INT340X_THERMAL + bool + depends on X86 && ACPI + help + Newer laptops and tablets that use ACPI may have thermal sensors and + other devices with thermal control capabilities outside the core + CPU/SOC, for thermal safety reasons. + They are exposed for the OS to use via the INT3400 ACPI device object + as the master, and INT3401~INT340B ACPI device objects as the slaves. + Enable this to expose the temperature information and cooling ability + from these objects to userspace via the normal thermal framework. + This means that a wide range of applications and GUI widgets can show + the information to the user or use this information for making + decisions. For example, the Intel Thermal Daemon can use this + information to allow the user to select his laptop to run without + turning on the fans. + menu "Texas Instruments thermal drivers" source "drivers/thermal/ti-soc-thermal/Kconfig" endmenu -- cgit v1.2.3 From feb8c6d3dd0f2cc0e1c3376d099cf298c5f2c2c8 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 11 Sep 2014 11:38:21 -0400 Subject: jbd2: fix journal checksum feature flag handling Clear all three journal checksum feature flags before turning on whichever journal checksum options we want. Rearrange the error checking so that newer flags get complained about first. Reported-by: TR Reardon Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 11 ++++++----- fs/jbd2/journal.c | 16 ++++++++-------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2766b8eba121..fb219b95f8d2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3237,6 +3237,10 @@ static int set_journal_csum_feature_set(struct super_block *sb) incompat = 0; } + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_CSUM_V3 | + JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, @@ -3249,11 +3253,8 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | - JBD2_FEATURE_INCOMPAT_CSUM_V3 | - JBD2_FEATURE_INCOMPAT_CSUM_V2); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 415041c4c40c..e4dc74713a43 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (jbd2_journal_has_csum_v2or3(journal) && - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " - "at the same time!\n"); - goto out; - } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { /* Can't have checksum v2 and v3 at the same time! */ @@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + goto out; + } + if (!jbd2_verify_csum_type(journal, sb)) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; -- cgit v1.2.3 From df4763bea5b04d8eed941cfe3df51f22cfe95570 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 11 Sep 2014 11:44:36 -0400 Subject: ext4: validate external journal superblock checksum If the external journal device has metadata_csum enabled, verify that the superblock checksum matches the block before we try to mount. Signed-off-by: Darrick J. Wong Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb219b95f8d2..263201793c65 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4414,6 +4414,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } + if ((le32_to_cpu(es->s_feature_ro_compat) & + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "corrupt superblock"); + brelse(bh); + goto out_bdev; + } + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); -- cgit v1.2.3 From 684de5748660e16e185754697ac0afa9e18297f6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 11 Sep 2014 11:45:12 -0400 Subject: ext4: don't keep using page if inline conversion fails If inline->extent conversion fails (most probably due to ENOSPC) and we release the temporary page that we allocated to transfer the file contents, don't keep using the page pointer after releasing the page. This occasionally leads to complaints about evicting locked pages or hangs when blocksize > pagesize, because it's possible for the page to get reallocated elsewhere in the meantime. Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: Tao Ma --- fs/ext4/inline.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bea662bd0ca6..378aadf5e6db 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -594,6 +594,7 @@ retry: if (ret) { unlock_page(page); page_cache_release(page); + page = NULL; ext4_orphan_add(handle, inode); up_write(&EXT4_I(inode)->xattr_sem); sem_held = 0; @@ -613,7 +614,8 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - block_commit_write(page, from, to); + if (page) + block_commit_write(page, from, to); out: if (page) { unlock_page(page); -- cgit v1.2.3 From 047133066e6c2549403fe5a2d619f47ba4212ef5 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:22 -0700 Subject: leds: Reorder include directives Reorder include directives so that they are arranged in alphabetical order. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 13 +++++++------ drivers/leds/led-core.c | 3 ++- include/linux/leds.h | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index aa29198fca3e..4bd4572efe6e 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -9,16 +9,17 @@ * published by the Free Software Foundation. */ -#include -#include +#include +#include +#include #include +#include +#include #include +#include +#include #include -#include #include -#include -#include -#include #include "leds.h" static struct class *leds_class; diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 71b40d3bf776..50b579ad948e 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -12,10 +12,11 @@ */ #include +#include #include #include +#include #include -#include #include "leds.h" DECLARE_RWSEM(leds_list_lock); diff --git a/include/linux/leds.h b/include/linux/leds.h index e43686472197..4be2d7623d9e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -13,8 +13,8 @@ #define __LINUX_LEDS_H_INCLUDED #include -#include #include +#include #include #include -- cgit v1.2.3 From d8082827d8a214343b761f2c4554d2a7d1573d63 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:23 -0700 Subject: leds: make brightness type consistent across whole subsystem Documentations states that brightness units type is enum led_brightness and this is the type used by the led API functions. Adjust the type of brightness variables in the struct led_classdev accordingly. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- include/linux/leds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/leds.h b/include/linux/leds.h index 4be2d7623d9e..f2e1cbc25705 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -31,8 +31,8 @@ enum led_brightness { struct led_classdev { const char *name; - int brightness; - int max_brightness; + enum led_brightness brightness; + enum led_brightness max_brightness; int flags; /* Lower 16 bits reflect status */ -- cgit v1.2.3 From 3841961269f76db243339a94005729f10829911e Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Thu, 7 Aug 2014 05:10:24 -0700 Subject: leds: avoid using DEVICE_ATTR macro for max_brightness attribute Make definition of the brightness related sysfs attributes consistent. The modification entails change of the function name: led_max_brightness_show -> max_brightness_show Signed-off-by: Jacek Anaszewski Acked-by: Sakari Ailus Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 4bd4572efe6e..71666a40b79a 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -60,14 +60,14 @@ static ssize_t brightness_store(struct device *dev, } static DEVICE_ATTR_RW(brightness); -static ssize_t led_max_brightness_show(struct device *dev, +static ssize_t max_brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { struct led_classdev *led_cdev = dev_get_drvdata(dev); return sprintf(buf, "%u\n", led_cdev->max_brightness); } -static DEVICE_ATTR(max_brightness, 0444, led_max_brightness_show, NULL); +static DEVICE_ATTR_RO(max_brightness); #ifdef CONFIG_LEDS_TRIGGERS static DEVICE_ATTR(trigger, 0644, led_trigger_show, led_trigger_store); -- cgit v1.2.3 From 7f14e6b9c36f6696eb937bc0cf86a7732aa89904 Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Fri, 8 Aug 2014 00:09:44 -0700 Subject: leds: lp3944: fix sparse warning Fix sparse warning appeared after changing brightness type in the leds.h from int to enum led_brightness. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/leds-lp3944.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/leds/leds-lp3944.c b/drivers/leds/leds-lp3944.c index 8e1abdcd4c9d..53144fb96167 100644 --- a/drivers/leds/leds-lp3944.c +++ b/drivers/leds/leds-lp3944.c @@ -335,7 +335,8 @@ static int lp3944_configure(struct i2c_client *client, } /* to expose the default value to userspace */ - led->ldev.brightness = led->status; + led->ldev.brightness = + (enum led_brightness) led->status; /* Set the default led status */ err = lp3944_led_set(led, led->status); -- cgit v1.2.3 From 914ae25a62e77ebbfa0ce7cbc60edd01cc4d1d31 Mon Sep 17 00:00:00 2001 From: Lothar Waßmann Date: Tue, 9 Sep 2014 00:40:32 -0700 Subject: leds: trigger: gpio: fix warning in gpio trigger for gpios whose accessor function may sleep MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When using a GPIO driver whose accessor functions may sleep (e.g. an I2C GPIO extender like PCA9554) the following warning is issued: WARNING: CPU: 0 PID: 665 at drivers/gpio/gpiolib.c:2274 gpiod_get_raw_value+0x3c/0x48() Modules linked in: CPU: 0 PID: 665 Comm: kworker/0:2 Not tainted 3.16.0-karo+ #115 Workqueue: events gpio_trig_work [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [] (show_stack) from [] (warn_slowpath_common+0x64/0x84) [] (warn_slowpath_common) from [] (warn_slowpath_null+0x1c/0x24) [] (warn_slowpath_null) from [] (gpiod_get_raw_value+0x3c/0x48) [] (gpiod_get_raw_value) from [] (gpio_trig_work+0x1c/0xb0) [] (gpio_trig_work) from [] (process_one_work+0x144/0x38c) [] (process_one_work) from [] (worker_thread+0x60/0x5cc) [] (worker_thread) from [] (kthread+0xb4/0xd0) [] (kthread) from [] (ret_from_fork+0x14/0x24) ---[ end trace cd51a1dad8b86c9c ]--- Fix this by using the _cansleep() variant of gpio_get_value(). Signed-off-by: Lothar Waßmann Signed-off-by: Bryan Wu --- drivers/leds/trigger/ledtrig-gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/trigger/ledtrig-gpio.c b/drivers/leds/trigger/ledtrig-gpio.c index 35812e3a37f2..c86c41826476 100644 --- a/drivers/leds/trigger/ledtrig-gpio.c +++ b/drivers/leds/trigger/ledtrig-gpio.c @@ -48,7 +48,7 @@ static void gpio_trig_work(struct work_struct *work) if (!gpio_data->gpio) return; - tmp = gpio_get_value(gpio_data->gpio); + tmp = gpio_get_value_cansleep(gpio_data->gpio); if (gpio_data->inverted) tmp = !tmp; -- cgit v1.2.3 From 3ef7de5304edf60d0b8674dd7cdacc104e15a93c Mon Sep 17 00:00:00 2001 From: Jacek Anaszewski Date: Wed, 20 Aug 2014 06:41:55 -0700 Subject: leds: Improve and export led_update_brightness led_update_brightness helper function used to be exploited only locally in the led-class.c module, where its result was being passed to the brightness_show sysfs callback. With the introduction of v4l2-flash subdevice the same functionality becomes required for reading current brightness from a LED device. This patch adds checking of return value of the brightness_get callback and moves the led_update_brightness() function to the LED subsystem public API. Signed-off-by: Jacek Anaszewski Acked-by: Kyungmin Park Cc: Richard Purdie Signed-off-by: Bryan Wu --- drivers/leds/led-class.c | 6 ------ drivers/leds/led-core.c | 16 ++++++++++++++++ include/linux/leds.h | 10 ++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 71666a40b79a..7440c58b8e6f 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -24,12 +24,6 @@ static struct class *leds_class; -static void led_update_brightness(struct led_classdev *led_cdev) -{ - if (led_cdev->brightness_get) - led_cdev->brightness = led_cdev->brightness_get(led_cdev); -} - static ssize_t brightness_show(struct device *dev, struct device_attribute *attr, char *buf) { diff --git a/drivers/leds/led-core.c b/drivers/leds/led-core.c index 50b579ad948e..aaa8eba9099f 100644 --- a/drivers/leds/led-core.c +++ b/drivers/leds/led-core.c @@ -127,3 +127,19 @@ void led_set_brightness(struct led_classdev *led_cdev, __led_set_brightness(led_cdev, brightness); } EXPORT_SYMBOL(led_set_brightness); + +int led_update_brightness(struct led_classdev *led_cdev) +{ + int ret = 0; + + if (led_cdev->brightness_get) { + ret = led_cdev->brightness_get(led_cdev); + if (ret >= 0) { + led_cdev->brightness = ret; + return 0; + } + } + + return ret; +} +EXPORT_SYMBOL(led_update_brightness); diff --git a/include/linux/leds.h b/include/linux/leds.h index f2e1cbc25705..a57611d0c94e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -140,6 +140,16 @@ extern void led_blink_set_oneshot(struct led_classdev *led_cdev, */ extern void led_set_brightness(struct led_classdev *led_cdev, enum led_brightness brightness); +/** + * led_update_brightness - update LED brightness + * @led_cdev: the LED to query + * + * Get an LED's current brightness and update led_cdev->brightness + * member with the obtained value. + * + * Returns: 0 on success or negative error value on failure + */ +extern int led_update_brightness(struct led_classdev *led_cdev); /* * LED Triggers -- cgit v1.2.3 From e1ea97fef0cd579fd7ef3851548e068eaf2ad9f0 Mon Sep 17 00:00:00 2001 From: Himangi Saraogi Date: Thu, 17 Jul 2014 02:29:53 +0530 Subject: target/configfs: Remove unnecessary null test This patch removes the null test on lun_cg. lun_cg is initialized at the beginning of the function to &lun->lun_group. Since lun_cg is dereferenced prior to the null test, it must be a valid pointer. The following Coccinelle script is used for detecting the change: @r@ expression e,f; identifier g,y; statement S1,S2; @@ *e = &f->g <+... f->y ...+> *if (e != NULL || ...) S1 else S2 Signed-off-by: Himangi Saraogi Acked-by: Julia Lawall Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_fabric_configfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 7de9f0475d05..7228a18b12ad 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -917,8 +917,7 @@ static struct config_group *target_fabric_make_lun( return &lun->lun_group; out: - if (lun_cg) - kfree(lun_cg->default_groups); + kfree(lun_cg->default_groups); return ERR_PTR(errno); } -- cgit v1.2.3 From c04047eceed45ae210d020868672456c33cae300 Mon Sep 17 00:00:00 2001 From: Andreea-Cristina Bernat Date: Mon, 18 Aug 2014 15:05:37 +0300 Subject: tcm_fc: Replace rcu_assign_pointer() with RCU_INIT_POINTER() The use of "rcu_assign_pointer()" is NULLing out the pointer. According to RCU_INIT_POINTER()'s block comment: "1. This use of RCU_INIT_POINTER() is NULLing out the pointer" it is better to use it instead of rcu_assign_pointer() because it has a smaller overhead. The following Coccinelle semantic patch was used: @@ @@ - rcu_assign_pointer + RCU_INIT_POINTER (..., NULL) Signed-off-by: Andreea-Cristina Bernat Signed-off-by: Nicholas Bellinger --- drivers/target/tcm_fc/tfc_sess.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/tcm_fc/tfc_sess.c b/drivers/target/tcm_fc/tfc_sess.c index 21ce50880c79..ccee7e332a4d 100644 --- a/drivers/target/tcm_fc/tfc_sess.c +++ b/drivers/target/tcm_fc/tfc_sess.c @@ -98,7 +98,7 @@ static void ft_tport_delete(struct ft_tport *tport) ft_sess_delete_all(tport); lport = tport->lport; BUG_ON(tport != lport->prov[FC_TYPE_FCP]); - rcu_assign_pointer(lport->prov[FC_TYPE_FCP], NULL); + RCU_INIT_POINTER(lport->prov[FC_TYPE_FCP], NULL); tpg = tport->tpg; if (tpg) { -- cgit v1.2.3 From c6c2a3de36b1e45841888e27bc2f85ef4e471ad3 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Fri, 22 Aug 2014 14:54:31 +0200 Subject: target: target_core_ua_h: Add #define of include guard Clearly the file was meant to contain an include guard, but it was missing the #define part. Signed-off-by: Rasmus Villemoes Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_ua.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/target/target_core_ua.h b/drivers/target/target_core_ua.h index be912b36daae..a6b56b364e7a 100644 --- a/drivers/target/target_core_ua.h +++ b/drivers/target/target_core_ua.h @@ -1,4 +1,5 @@ #ifndef TARGET_CORE_UA_H +#define TARGET_CORE_UA_H /* * From spc4r17, Table D.1: ASC and ASCQ Assignement -- cgit v1.2.3 From a0626e75954078cfacddb00a4545dde821170bc5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 16 Sep 2014 14:34:59 -0400 Subject: ext4: check EA value offset when loading When loading extended attributes, check each entry's value offset to make sure it doesn't collide with the entries. Without this check it is easy to crash the kernel by mounting a malicious FS containing a file with an EA wherein e_value_offs = 0 and e_value_size > 0 and then deleting the EA, which corrupts the name list. (See the f_ea_value_crash test's FS image in e2fsprogs for an example.) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index da4df703c211..42823ab3718c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -190,14 +190,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, + void *value_start) { - while (!IS_LAST_ENTRY(entry)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + struct ext4_xattr_entry *e = entry; + + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) return -EIO; - entry = next; + e = next; } + + while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_size != 0 && + (value_start + le16_to_cpu(entry->e_value_offs) < + (void *)e + sizeof(__u32) || + value_start + le16_to_cpu(entry->e_value_offs) + + le32_to_cpu(entry->e_value_size) > end)) + return -EIO; + entry = EXT4_XATTR_NEXT(entry); + } + return 0; } @@ -214,7 +228,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EIO; if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -331,7 +346,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end); + error = ext4_xattr_check_names(entry, end, entry); if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, @@ -463,7 +478,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), @@ -980,7 +995,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end); + error = ext4_xattr_check_names(IFIRST(header), is->s.end, + IFIRST(header)); if (error) return error; /* Find the named attribute. */ -- cgit v1.2.3 From 064d83892e9ba547f7d4eae22cbca066d95210ce Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 16 Sep 2014 14:43:09 -0400 Subject: jbd2: free bh when descriptor block checksum fails Free the buffer head if the journal descriptor block fails checksum verification. This is the jbd2 port of the e2fsprogs patch "e2fsck: free bh on csum verify error in do_one_pass". Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Reviewed-by: Eric Sandeen Cc: stable@vger.kernel.org --- fs/jbd2/recovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9b329b55ffe3..bcbef08a4d8f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal, !jbd2_descr_block_csum_verify(journal, bh->b_data)) { err = -EIO; + brelse(bh); goto failed; } -- cgit v1.2.3 From 1245799f752fa817a030b3b4448466e83ee7d61d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 16 Sep 2014 14:50:50 -0400 Subject: jbd2: jbd2_log_wait_for_space improve error detetcion If EIO happens after we have dropped j_state_lock, we won't notice that the journal has been aborted. So it is reasonable to move this check after we have grabbed the j_checkpoint_mutex and re-grabbed the j_state_lock. This patch helps to prevent false positive complain after EIO. #DMESG: __jbd2_log_wait_for_space: needed 8448 blocks and only had 8386 space available __jbd2_log_wait_for_space: no way to get more journal space in ram1-8 ------------[ cut here ]------------ WARNING: CPU: 15 PID: 6739 at fs/jbd2/checkpoint.c:168 __jbd2_log_wait_for_space+0x188/0x200() Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod CPU: 15 PID: 6739 Comm: fsstress Tainted: G W 3.17.0-rc2-00429-g684de57 #139 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 00000000000000a8 ffff88077aaab878 ffffffff815c1a8c 00000000000000a8 0000000000000000 ffff88077aaab8b8 ffffffff8106ce8c ffff88077aaab898 ffff8807c57e6000 ffff8807c57e6028 0000000000002100 ffff8807c57e62f0 Call Trace: [] dump_stack+0x51/0x6d [] warn_slowpath_common+0x8c/0xc0 [] warn_slowpath_null+0x1a/0x20 [] __jbd2_log_wait_for_space+0x188/0x200 [] start_this_handle+0x4da/0x7b0 [] ? local_clock+0x25/0x30 [] ? lockdep_init_map+0xe7/0x180 [] jbd2__journal_start+0xdc/0x1d0 [] ? __ext4_new_inode+0x7f4/0x1330 [] __ext4_journal_start_sb+0xf8/0x110 [] __ext4_new_inode+0x7f4/0x1330 [] ? lock_release_holdtime+0x29/0x190 [] ext4_create+0x8b/0x150 [] vfs_create+0x7b/0xb0 [] do_last+0x7db/0xcf0 [] ? inode_permission+0x4d/0x50 [] path_openat+0x242/0x590 [] ? __alloc_fd+0x36/0x140 [] do_filp_open+0x4a/0xb0 [] ? __alloc_fd+0x121/0x140 [] do_sys_open+0x170/0x220 [] SyS_open+0x1e/0x20 [] SyS_creat+0x16/0x20 [] system_call_fastpath+0x16/0x1b ---[ end trace cd71c831f82059db ]--- Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9ffb19cf376b..1fbf59938cc0 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -115,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { - if (journal->j_flags & JBD2_ABORT) - return; write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); @@ -132,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal) * trace for forensic evidence. */ write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + mutex_unlock(&journal->j_checkpoint_mutex); + return; + } spin_lock(&journal->j_list_lock); nblocks = jbd2_space_needed(journal); space_left = jbd2_log_space_left(journal); -- cgit v1.2.3 From 844749764b416ee2c4ba2da328c04eaad7388242 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Tue, 16 Sep 2014 14:52:03 -0400 Subject: ext4: explicitly inform user about orphan list cleanup Production fs likely compiled/mounted w/o jbd debugging, so orphan list clearing will be silent. Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 263201793c65..028935fc2760 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2191,7 +2191,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { - jbd_debug(1, "Errors on filesystem, " + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; } -- cgit v1.2.3 From fbecb6596a80554423d00aba92f2752a2ee0a62d Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:47 -0400 Subject: iscsi-target: remove unused debug code Last user of buf was removed with c6037cc546ca. While at it, free_cpumask_var() handles a NULL argument just fine, so remove the conditionals. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target.c | 6 +----- drivers/target/iscsi/iscsi_target_login.c | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 1f4c794f5fcc..30f4a7d42e32 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -3709,7 +3709,6 @@ static inline void iscsit_thread_check_cpumask( struct task_struct *p, int mode) { - char buf[128]; /* * mode == 1 signals iscsi_target_tx_thread() usage. * mode == 0 signals iscsi_target_rx_thread() usage. @@ -3728,8 +3727,6 @@ static inline void iscsit_thread_check_cpumask( * both TX and RX kthreads are scheduled to run on the * same CPU. */ - memset(buf, 0, 128); - cpumask_scnprintf(buf, 128, conn->conn_cpumask); set_cpus_allowed_ptr(p, conn->conn_cpumask); } @@ -4326,8 +4323,7 @@ int iscsit_close_connection( if (conn->conn_tx_hash.tfm) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); conn->conn_ops = NULL; diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 5e71ac609418..b1ae5cbe70f8 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1190,8 +1190,7 @@ old_sess_out: if (!IS_ERR(conn->conn_tx_hash.tfm)) crypto_free_hash(conn->conn_tx_hash.tfm); - if (conn->conn_cpumask) - free_cpumask_var(conn->conn_cpumask); + free_cpumask_var(conn->conn_cpumask); kfree(conn->conn_ops); -- cgit v1.2.3 From cb35484231e0b7edf23e192867e5fba955e584cb Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:50 -0400 Subject: iscsi-target: remove always-true conditions Found by coverity. InitiatorName and InitiatorAlias are static arrays and therefore always non-NULL. At some point in the past they may have been dynamically allocated, but for current code the condition is useless. If the intent was to check InitiatorName[0] instead, I cannot find a use for that either. Let's get rid of it. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_configfs.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_configfs.c b/drivers/target/iscsi/iscsi_target_configfs.c index ae03f3e5de1e..9059c1e0b26e 100644 --- a/drivers/target/iscsi/iscsi_target_configfs.c +++ b/drivers/target/iscsi/iscsi_target_configfs.c @@ -669,12 +669,10 @@ static ssize_t lio_target_nacl_show_info( } else { sess = se_sess->fabric_sess_ptr; - if (sess->sess_ops->InitiatorName) - rb += sprintf(page+rb, "InitiatorName: %s\n", - sess->sess_ops->InitiatorName); - if (sess->sess_ops->InitiatorAlias) - rb += sprintf(page+rb, "InitiatorAlias: %s\n", - sess->sess_ops->InitiatorAlias); + rb += sprintf(page+rb, "InitiatorName: %s\n", + sess->sess_ops->InitiatorName); + rb += sprintf(page+rb, "InitiatorAlias: %s\n", + sess->sess_ops->InitiatorAlias); rb += sprintf(page+rb, "LIO Session ID: %u " "ISID: 0x%02x %02x %02x %02x %02x %02x " -- cgit v1.2.3 From 5c22e2294156377b7e2d2d99aaffea9ae6994452 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:51 -0400 Subject: iscsi-target: simplify return statement The return statement cannot be reached without either recovery or dump being set to 1. Therefore the condition always evaluates to true and recovery and dump are useless variables. Found by Coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_erl0.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_erl0.c b/drivers/target/iscsi/iscsi_target_erl0.c index 0d1e6ee3e992..a0ae5fc0ad75 100644 --- a/drivers/target/iscsi/iscsi_target_erl0.c +++ b/drivers/target/iscsi/iscsi_target_erl0.c @@ -345,7 +345,6 @@ static int iscsit_dataout_check_datasn( struct iscsi_cmd *cmd, unsigned char *buf) { - int dump = 0, recovery = 0; u32 data_sn = 0; struct iscsi_conn *conn = cmd->conn; struct iscsi_data *hdr = (struct iscsi_data *) buf; @@ -370,13 +369,11 @@ static int iscsit_dataout_check_datasn( pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " higher than expected 0x%08x.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - recovery = 1; goto recover; } else if (be32_to_cpu(hdr->datasn) < data_sn) { pr_err("Command ITT: 0x%08x, received DataSN: 0x%08x" " lower than expected 0x%08x, discarding payload.\n", cmd->init_task_tag, be32_to_cpu(hdr->datasn), data_sn); - dump = 1; goto dump; } @@ -392,8 +389,7 @@ dump: if (iscsit_dump_data_payload(conn, payload_length, 1) < 0) return DATAOUT_CANNOT_RECOVER; - return (recovery || dump) ? DATAOUT_WITHIN_COMMAND_RECOVERY : - DATAOUT_NORMAL; + return DATAOUT_WITHIN_COMMAND_RECOVERY; } static int iscsit_dataout_pre_datapduinorder_yes( -- cgit v1.2.3 From 1d30686da4a40029cb48eab28442896b58aeceef Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 17 Sep 2014 13:17:55 -0700 Subject: iscsi-target: Drop duplicate __iscsi_target_login_thread check This patch drops the now duplicate + unnecessary check for -ENODEV from iscsi_transport->iscsit_accept_np() for jumping to out:, or immediately returning 1 in __iscsi_target_login_thread() code. Since commit 81a9c5e72b the jump to out: and returning 1 have the same effect, and end up hitting the ISCSI_NP_THREAD_SHUTDOWN check regardless at the top of __iscsi_target_login_thread() during next loop iteration. So that said, it's safe to go ahead and remove this duplicate check. Reported-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_login.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index b1ae5cbe70f8..02d5ccd0eaa1 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -1267,8 +1267,6 @@ static int __iscsi_target_login_thread(struct iscsi_np *np) iscsit_put_transport(conn->conn_transport); kfree(conn); conn = NULL; - if (ret == -ENODEV) - goto out; /* Get another socket */ return 1; } -- cgit v1.2.3 From 94e16e9c59312247de199b5f9bf141d1bd946dd0 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:53 -0400 Subject: iscsi-target: remove unnecessary check in iscsit_setup_np error path Found by coverity. At this point sock is non-NULL, so the check to unnecessary. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_login.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 02d5ccd0eaa1..480f2e0ecc11 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -978,8 +978,7 @@ int iscsit_setup_np( return 0; fail: np->np_socket = NULL; - if (sock) - sock_release(sock); + sock_release(sock); return ret; } -- cgit v1.2.3 From fdc84d11a278d468052afc8e17523545fafe6c5f Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:55 -0400 Subject: iscsi-target: use strlcpy in iscsit_collect_login_stats last_intr_fail_name is a fixed-size array and could theoretically overflow. In reality intrname->value doesn't seem to depend on untrusted input or be anywhere near 224 characters, so the overflow is pretty theoretical. But strlcpy is cheap enough. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/iscsi/iscsi_target_util.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/target/iscsi/iscsi_target_util.c b/drivers/target/iscsi/iscsi_target_util.c index fd90b28f1d94..5d611d7ba282 100644 --- a/drivers/target/iscsi/iscsi_target_util.c +++ b/drivers/target/iscsi/iscsi_target_util.c @@ -1479,8 +1479,9 @@ void iscsit_collect_login_stats( if (conn->param_list) intrname = iscsi_find_param_from_key(INITIATORNAME, conn->param_list); - strcpy(ls->last_intr_fail_name, - (intrname ? intrname->value : "Unknown")); + strlcpy(ls->last_intr_fail_name, + (intrname ? intrname->value : "Unknown"), + sizeof(ls->last_intr_fail_name)); ls->last_intr_fail_ip_family = conn->login_family; -- cgit v1.2.3 From 8d2135592d2ab5c8d7764a4f534afac64e563691 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:56 -0400 Subject: target: Fix possible memory leak in aptpl_metadata parsing Each case of match_strdup could leak memory if the same argument was present before. I am not too concerned, as it would require a non-sensical combination like "target_lun=foo target_lun=bar", done with root privileges and even then leak just a few bytes per instance. But arg_p is different, as it will always leak memory. Let's plug that one. And while at it, replace some &args[0] with args. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_configfs.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index bf55c5a04cfa..291dc711fbc3 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -1263,7 +1263,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( { unsigned char *i_fabric = NULL, *i_port = NULL, *isid = NULL; unsigned char *t_fabric = NULL, *t_port = NULL; - char *orig, *ptr, *arg_p, *opts; + char *orig, *ptr, *opts; substring_t args[MAX_OPT_ARGS]; unsigned long long tmp_ll; u64 sa_res_key = 0; @@ -1295,14 +1295,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( token = match_token(ptr, tokens, args); switch (token) { case Opt_initiator_fabric: - i_fabric = match_strdup(&args[0]); + i_fabric = match_strdup(args); if (!i_fabric) { ret = -ENOMEM; goto out; } break; case Opt_initiator_node: - i_port = match_strdup(&args[0]); + i_port = match_strdup(args); if (!i_port) { ret = -ENOMEM; goto out; @@ -1316,7 +1316,7 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_initiator_sid: - isid = match_strdup(&args[0]); + isid = match_strdup(args); if (!isid) { ret = -ENOMEM; goto out; @@ -1330,15 +1330,9 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( } break; case Opt_sa_res_key: - arg_p = match_strdup(&args[0]); - if (!arg_p) { - ret = -ENOMEM; - goto out; - } - ret = kstrtoull(arg_p, 0, &tmp_ll); + ret = kstrtoull(args->from, 0, &tmp_ll); if (ret < 0) { - pr_err("kstrtoull() failed for" - " sa_res_key=\n"); + pr_err("kstrtoull() failed for sa_res_key=\n"); goto out; } sa_res_key = (u64)tmp_ll; @@ -1370,14 +1364,14 @@ static ssize_t target_core_dev_pr_store_attr_res_aptpl_metadata( * PR APTPL Metadata for Target Port */ case Opt_target_fabric: - t_fabric = match_strdup(&args[0]); + t_fabric = match_strdup(args); if (!t_fabric) { ret = -ENOMEM; goto out; } break; case Opt_target_node: - t_port = match_strdup(&args[0]); + t_port = match_strdup(args); if (!t_port) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From da0abaee4793bac4047b3bdfd221fc54850bbf5f Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:57 -0400 Subject: target: Fix memory leak on error in target_fabric_make_mappedlun This patch fixes a memory leak on error in target_fabric_make_mappedlun(), where se_lun_acl memory does not get released on exit. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_fabric_configfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 7228a18b12ad..0638a672d911 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -320,7 +320,7 @@ static struct config_group *target_fabric_make_mappedlun( struct se_node_acl, acl_group); struct se_portal_group *se_tpg = se_nacl->se_tpg; struct target_fabric_configfs *tf = se_tpg->se_tpg_wwn->wwn_tf; - struct se_lun_acl *lacl; + struct se_lun_acl *lacl = NULL; struct config_item *acl_ci; struct config_group *lacl_cg = NULL, *ml_stat_grp = NULL; char *buf; @@ -406,6 +406,7 @@ static struct config_group *target_fabric_make_mappedlun( out: if (lacl_cg) kfree(lacl_cg->default_groups); + kfree(lacl); kfree(buf); return ERR_PTR(ret); } -- cgit v1.2.3 From 1481473b5656d8841f63c455594f340306c22cb0 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Wed, 17 Sep 2014 15:11:28 -0700 Subject: target: simplify target_fabric_make_lun error path Coverity complained that lun_cg has been dereferenced in all paths leading to NULL check. It didn't mention that only a single path could lead there and the code can be simplified even further. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_fabric_configfs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index 0638a672d911..dc6c781732ee 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -911,15 +911,12 @@ static struct config_group *target_fabric_make_lun( GFP_KERNEL); if (!port_stat_grp->default_groups) { pr_err("Unable to allocate port_stat_grp->default_groups\n"); - errno = -ENOMEM; - goto out; + kfree(lun_cg->default_groups); + return ERR_PTR(-ENOMEM); } target_stat_setup_port_default_groups(lun); return &lun->lun_group; -out: - kfree(lun_cg->default_groups); - return ERR_PTR(errno); } static void target_fabric_drop_lun( -- cgit v1.2.3 From 68edbce4fb4b173d3b9880967cfcce0fc3abc8d5 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:49:59 -0400 Subject: target: fix pr_out length in iscsi_parse_pr_out_transport_id Old code in iscsi_parse_pr_out_transport_id() was obviously buggy and effectively ignored the high byte. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_fabric_lib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_fabric_lib.c b/drivers/target/target_core_fabric_lib.c index 0d1cf8b4f49f..35bfe77160d8 100644 --- a/drivers/target/target_core_fabric_lib.c +++ b/drivers/target/target_core_fabric_lib.c @@ -394,9 +394,9 @@ char *iscsi_parse_pr_out_transport_id( * If the caller wants the TransportID Length, we set that value for the * entire iSCSI Tarnsport ID now. */ - if (out_tid_len != NULL) { - add_len = ((buf[2] >> 8) & 0xff); - add_len |= (buf[3] & 0xff); + if (out_tid_len) { + /* The shift works thanks to integer promotion rules */ + add_len = (buf[2] << 8) | buf[3]; tid_len = strlen(&buf[4]); tid_len += 4; /* Add four bytes for iSCSI Transport ID header */ -- cgit v1.2.3 From ce31c1b0dc4038a1dec64585d892adb73d9c45f4 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:50:00 -0400 Subject: target: correctly handle match_int errors in FILEIO + PSCSI This patch correctly handles match_int() errors in FILEIO + PSCSI backend parameter parsing, which can potentially fail due to a memory allocation failure or invalid argument. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_file.c | 4 +++- drivers/target/target_core_pscsi.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index 7d6cddaec525..ab2c53b63cc3 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -762,7 +762,9 @@ static ssize_t fd_set_configfs_dev_params(struct se_device *dev, fd_dev->fbd_flags |= FBDF_HAS_SIZE; break; case Opt_fd_buffered_io: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; if (arg != 1) { pr_err("bogus fd_buffered_io=%d value\n", arg); ret = -EINVAL; diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 943b1dbe859a..a1690a3fdd7f 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -749,14 +749,18 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, ret = -EINVAL; goto out; } - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_host_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Host ID:" " %d\n", phv->phv_host_id, pdv->pdv_host_id); pdv->pdv_flags |= PDF_HAS_VIRT_HOST_ID; break; case Opt_scsi_channel_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_channel_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Channel" " ID: %d\n", phv->phv_host_id, @@ -764,7 +768,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_CHANNEL_ID; break; case Opt_scsi_target_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_target_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI Target" " ID: %d\n", phv->phv_host_id, @@ -772,7 +778,9 @@ static ssize_t pscsi_set_configfs_dev_params(struct se_device *dev, pdv->pdv_flags |= PDF_HAS_TARGET_ID; break; case Opt_scsi_lun_id: - match_int(args, &arg); + ret = match_int(args, &arg); + if (ret) + goto out; pdv->pdv_lun_id = arg; pr_debug("PSCSI[%d]: Referencing SCSI LUN ID:" " %d\n", phv->phv_host_id, pdv->pdv_lun_id); -- cgit v1.2.3 From c435285df112da1125e61d826b03014a4e769386 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 2 Sep 2014 17:50:01 -0400 Subject: target: fix unused shift in core_scsi3_pri_report_capabilities Clearly a right-shift was meant. Effectively doesn't make a difference, as add_len is hard-coded to 8 and the high byte will be zero no matter which way you shift. But I hate leaving bad examples for others to copy. Found by coverity. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_pr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index df357862286e..281d52e3fe99 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -3803,7 +3803,7 @@ core_scsi3_pri_report_capabilities(struct se_cmd *cmd) if (!buf) return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; - buf[0] = ((add_len << 8) & 0xff); + buf[0] = ((add_len >> 8) & 0xff); buf[1] = (add_len & 0xff); buf[2] |= 0x10; /* CRH: Compatible Reservation Hanlding bit. */ buf[2] |= 0x08; /* SIP_C: Specify Initiator Ports Capable bit */ -- cgit v1.2.3 From cc97f1a7c7eed970e674b84be0e68f479c80228d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Sep 2014 00:42:16 -0400 Subject: jbd2: avoid pointless scanning of checkpoint lists Yuanhan has reported that when he is running fsync(2) heavy workload creating new files over ramdisk, significant amount of time is spent in __jbd2_journal_clean_checkpoint_list() trying to clean old transactions (but they cannot be cleaned up because flusher hasn't yet checkpointed those buffers). The workload can be generated by: fs_mark -d /fs/ram0/1 -D 2 -N 2560 -n 1000000 -L 1 -S 1 -s 4096 Reduce the amount of scanning by stopping to scan the transaction list once we find a transaction that cannot be checkpointed. Note that this way of cleaning is still enough to keep freeing space in the journal after fully checkpointed transactions. Reported-and-tested-by: Yuanhan Liu Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1fbf59938cc0..3ab4c5ee12ce 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -420,7 +420,6 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Find all the written-back checkpoint buffers in the given list and * release them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of buffers reaped (for debug) */ @@ -440,12 +439,12 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) jh = next_jh; next_jh = jh->b_cpnext; ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + if (!ret) + return freed; + freed++; + if (ret == 2) { + *released = 1; + return freed; } /* * This function only frees up some memory @@ -465,7 +464,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Find all the written-back checkpoint buffers in the journal and release them. * - * Called with the journal locked. * Called with j_list_lock held. * Returns number of buffers reaped (for debug) */ @@ -473,7 +471,8 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) int __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; + int ret; + int freed = 0; int released; transaction = journal->j_checkpoint_transactions; @@ -485,17 +484,21 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal) do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> + ret = journal_clean_one_cp_list(transaction-> t_checkpoint_list, &released); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ - if (need_resched()) + if (need_resched()) { + freed += ret; goto out; - if (released) + } + if (released) { + freed += ret; continue; + } /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as @@ -503,11 +506,12 @@ int __jbd2_journal_clean_checkpoint_list(journal_t *journal) */ ret += journal_clean_one_cp_list(transaction-> t_checkpoint_io_list, &released); - if (need_resched()) + freed += ret; + if (need_resched() || !ret) goto out; } while (transaction != last_transaction); out: - return ret; + return freed; } /* -- cgit v1.2.3 From 50849db32a9f529235a84bcc84a6b8e631b1d0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Sep 2014 00:58:12 -0400 Subject: jbd2: simplify calling convention around __jbd2_journal_clean_checkpoint_list __jbd2_journal_clean_checkpoint_list() returns number of buffers it freed but noone was using the value so just stop doing that. This also allows for simplifying the calling convention for journal_clean_once_cp_list(). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/jbd2/checkpoint.c | 56 ++++++++++++++++++++++------------------------------ include/linux/jbd2.h | 2 +- 2 files changed, 25 insertions(+), 33 deletions(-) diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 3ab4c5ee12ce..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -421,16 +421,15 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns 1 if we freed the transaction, 0 otherwise. */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +static int journal_clean_one_cp_list(struct journal_head *jh) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret, freed = 0; + int ret; + int freed = 0; - *released = 0; if (!jh) return 0; @@ -441,11 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) ret = __try_to_free_cp_buf(jh); if (!ret) return freed; - freed++; - if (ret == 2) { - *released = 1; - return freed; - } + if (ret == 2) + return 1; + freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -465,53 +462,48 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * Find all the written-back checkpoint buffers in the journal and release them. * * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) */ - -int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; int ret; - int freed = 0; - int released; transaction = journal->j_checkpoint_transactions; if (!transaction) - goto out; + return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret = journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ - if (need_resched()) { - freed += ret; - goto out; - } - if (released) { - freed += ret; + if (need_resched()) + return; + if (ret) continue; - } /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as * we can possibly see not yet submitted buffers on io_list */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); - freed += ret; - if (need_resched() || !ret) - goto out; + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); + if (need_resched()) + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; } while (transaction != last_transaction); -out: - return freed; } /* diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 0dae71e9971c..704b9a599b26 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1042,7 +1042,7 @@ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block); extern void jbd2_journal_commit_transaction(journal_t *); /* Checkpoint list management */ -int __jbd2_journal_clean_checkpoint_list(journal_t *journal); +void __jbd2_journal_clean_checkpoint_list(journal_t *journal); int __jbd2_journal_remove_checkpoint(struct journal_head *); void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); -- cgit v1.2.3 From 279bf6d390933d5353ab298fcc306c391a961469 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Sep 2014 01:12:15 -0400 Subject: ext4: don't check quota format when there are no quota files The check whether quota format is set even though there are no quota files with journalled quota is pointless and it actually makes it impossible to turn off journalled quotas (as there's no way to unset journalled quota format). Just remove the check. CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 028935fc2760..115e27d855ef 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1712,13 +1712,6 @@ static int parse_options(char *options, struct super_block *sb, "not specified"); return 0; } - } else { - if (sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "specified with no journaling " - "enabled"); - return 0; - } } #endif if (test_opt(sb, DIOREAD_NOLOCK)) { -- cgit v1.2.3 From bda3253043c54a705c8352096194ab6216e2e5c1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 18 Sep 2014 16:12:37 -0400 Subject: ext4: fold ext4_sync_fs_nojournal() into ext4_sync_fs() This allows us to eliminate duplicate code, and eventually allow us to also fold ext4_sops and ext4_nojournal_sops together. Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 115e27d855ef..4770c98bdb61 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -1131,7 +1130,7 @@ static const struct super_operations ext4_nojournal_sops = { .dirty_inode = ext4_dirty_inode, .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, - .sync_fs = ext4_sync_fs_nojournal, + .sync_fs = ext4_sync_fs, .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, @@ -4718,15 +4717,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ - target = jbd2_get_latest_transaction(sbi->s_journal); - if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + if (sbi->s_journal) { + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + ret = jbd2_log_wait_commit(sbi->s_journal, + target); + } + } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; - - if (jbd2_journal_start_commit(sbi->s_journal, &target)) { - if (wait) - ret = jbd2_log_wait_commit(sbi->s_journal, target); - } if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); @@ -4737,19 +4740,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) return ret; } -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) -{ - int ret = 0; - - trace_ext4_sync_fs(sb, wait); - flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - dquot_writeback_dquots(sb, -1); - if (wait && test_opt(sb, BARRIER)) - ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); - - return ret; -} - /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. -- cgit v1.2.3 From bb0445765866e5b1607af81e2f48ca5a8efbeed8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 18 Sep 2014 17:12:02 -0400 Subject: ext4: support freezing ext2 (nojournal) file systems Through an oversight, when we added nojournal support to ext4, we didn't add support to allow file system freezing. This is relatively easy to add, so let's do it. Signed-off-by: Theodore Ts'o Reported-by: Dexuan Cui --- fs/ext4/super.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4770c98bdb61..4db537b3a162 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1131,6 +1131,8 @@ static const struct super_operations ext4_nojournal_sops = { .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .sync_fs = ext4_sync_fs, + .freeze_fs = ext4_freeze, + .unfreeze_fs = ext4_unfreeze, .put_super = ext4_put_super, .statfs = ext4_statfs, .remount_fs = ext4_remount, @@ -4758,23 +4760,26 @@ static int ext4_freeze(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * Don't clear the needs_recovery flag if we failed to flush - * the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to + * flush the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on upper layer to stop further updates */ - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (journal) + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(journal); return error; } -- cgit v1.2.3 From f6e63f90809946d410c42045577cb159fedabf8c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 18 Sep 2014 17:12:30 -0400 Subject: ext4: fold ext4_nojournal_sops into ext4_sops There's no longer any need to have a separate set of super_operations for nojournal mode. Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4db537b3a162..1070d6e521c6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1123,27 +1123,6 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; -static const struct super_operations ext4_nojournal_sops = { - .alloc_inode = ext4_alloc_inode, - .destroy_inode = ext4_destroy_inode, - .write_inode = ext4_write_inode, - .dirty_inode = ext4_dirty_inode, - .drop_inode = ext4_drop_inode, - .evict_inode = ext4_evict_inode, - .sync_fs = ext4_sync_fs, - .freeze_fs = ext4_freeze, - .unfreeze_fs = ext4_unfreeze, - .put_super = ext4_put_super, - .statfs = ext4_statfs, - .remount_fs = ext4_remount, - .show_options = ext4_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext4_quota_read, - .quota_write = ext4_quota_write, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -3941,11 +3920,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; + sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA -- cgit v1.2.3 From 7990da71ebfa887ae6fe4464ab0d99ddeb8efacc Mon Sep 17 00:00:00 2001 From: Tomeu Vizoso Date: Wed, 3 Sep 2014 17:49:32 +0200 Subject: PM / QoS: Add PM_QOS_MEMORY_BANDWIDTH class Also adds a class type PM_QOS_SUM that aggregates the values by summing them. It can be used by memory controllers to calculate the optimum clock frequency based on the bandwidth needs of the different memory clients. Signed-off-by: Tomeu Vizoso Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- Documentation/power/pm_qos_interface.txt | 4 +++- include/linux/pm_qos.h | 5 ++++- kernel/power/qos.c | 27 ++++++++++++++++++++++++++- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt index a5da5c7e7128..129f7c0e1483 100644 --- a/Documentation/power/pm_qos_interface.txt +++ b/Documentation/power/pm_qos_interface.txt @@ -5,7 +5,8 @@ performance expectations by drivers, subsystems and user space applications on one of the parameters. Two different PM QoS frameworks are available: -1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput. +1. PM QoS classes for cpu_dma_latency, network_latency, network_throughput, +memory_bandwidth. 2. the per-device PM QoS framework provides the API to manage the per-device latency constraints and PM QoS flags. @@ -13,6 +14,7 @@ Each parameters have defined units: * latency: usec * timeout: usec * throughput: kbs (kilo bit / sec) + * memory bandwidth: mbs (mega bit / sec) 1. PM QoS framework diff --git a/include/linux/pm_qos.h b/include/linux/pm_qos.h index 9ab4bf7c4646..636e82834506 100644 --- a/include/linux/pm_qos.h +++ b/include/linux/pm_qos.h @@ -15,6 +15,7 @@ enum { PM_QOS_CPU_DMA_LATENCY, PM_QOS_NETWORK_LATENCY, PM_QOS_NETWORK_THROUGHPUT, + PM_QOS_MEMORY_BANDWIDTH, /* insert new class ID */ PM_QOS_NUM_CLASSES, @@ -32,6 +33,7 @@ enum pm_qos_flags_status { #define PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_LAT_DEFAULT_VALUE (2000 * USEC_PER_SEC) #define PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE 0 +#define PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE 0 #define PM_QOS_RESUME_LATENCY_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_DEFAULT_VALUE 0 #define PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT (-1) @@ -69,7 +71,8 @@ struct dev_pm_qos_request { enum pm_qos_type { PM_QOS_UNITIALIZED, PM_QOS_MAX, /* return the largest value */ - PM_QOS_MIN /* return the smallest value */ + PM_QOS_MIN, /* return the smallest value */ + PM_QOS_SUM /* return the sum */ }; /* diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 884b77058864..5f4c006c4b1e 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier); +static struct pm_qos_constraints memory_bw_constraints = { + .list = PLIST_HEAD_INIT(memory_bw_constraints.list), + .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE, + .type = PM_QOS_SUM, + .notifiers = &memory_bandwidth_notifier, +}; +static struct pm_qos_object memory_bandwidth_pm_qos = { + .constraints = &memory_bw_constraints, + .name = "memory_bandwidth", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &memory_bandwidth_pm_qos, }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, @@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = { /* unlocked internal variant */ static inline int pm_qos_get_value(struct pm_qos_constraints *c) { + struct plist_node *node; + int total_value = 0; + if (plist_head_empty(&c->list)) return c->no_constraint_value; @@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c) case PM_QOS_MAX: return plist_last(&c->list)->prio; + case PM_QOS_SUM: + plist_for_each(node, &c->list) + total_value += node->prio; + + return total_value; + default: /* runtime check for not using enum */ BUG(); -- cgit v1.2.3 From e2e08970100db03bb84fd4a72f9c35bfc18d595a Mon Sep 17 00:00:00 2001 From: Nikolaus Voss Date: Tue, 23 Sep 2014 15:30:21 +0200 Subject: pwm: atmel: Fix calculation of prescale value The prescale value used for calculating the period was incremented afterwards, thus the resulting prescale value is by one too high. This resulted in a PWM frequency only half as high as requested. This patch moves the 64 bit division out of the prescale loop to correct the above issue and make the calculation more efficient. Signed-off-by: Nikolaus Voss Tested-by: Bo Shen Acked-by: Bo Shen Signed-off-by: Thierry Reding --- drivers/pwm/pwm-atmel.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/pwm/pwm-atmel.c b/drivers/pwm/pwm-atmel.c index 6e700a541ca3..d3c22de9ee47 100644 --- a/drivers/pwm/pwm-atmel.c +++ b/drivers/pwm/pwm-atmel.c @@ -102,7 +102,7 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, int duty_ns, int period_ns) { struct atmel_pwm_chip *atmel_pwm = to_atmel_pwm_chip(chip); - unsigned long clk_rate, prd, dty; + unsigned long prd, dty; unsigned long long div; unsigned int pres = 0; u32 val; @@ -113,20 +113,18 @@ static int atmel_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm, return -EBUSY; } - clk_rate = clk_get_rate(atmel_pwm->clk); - div = clk_rate; + /* Calculate the period cycles and prescale value */ + div = (unsigned long long)clk_get_rate(atmel_pwm->clk) * period_ns; + do_div(div, NSEC_PER_SEC); - /* Calculate the period cycles */ while (div > PWM_MAX_PRD) { - div = clk_rate / (1 << pres); - div = div * period_ns; - /* 1/Hz = 100000000 ns */ - do_div(div, 1000000000); - - if (pres++ > PRD_MAX_PRES) { - dev_err(chip->dev, "pres exceeds the maximum value\n"); - return -EINVAL; - } + div >>= 1; + pres++; + } + + if (pres > PRD_MAX_PRES) { + dev_err(chip->dev, "pres exceeds the maximum value\n"); + return -EINVAL; } /* Calculate the duty cycles */ -- cgit v1.2.3 From 619f30188ff0d10fccc3cd952a79cb56ff62db54 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 4 Sep 2014 10:57:33 +0530 Subject: ARC: Allow SMP kernel to build/boot on UP-only infrastructure In light of recent SNAFU with SMP build, allow simple platform to build as SMP but run UP. * Remove the dependence on simulation SMP extension to enable quick build/test iterations of SMP kernel. * In absence of platform SMP registration, prevent the NULL smp feature name from borkign the system Signed-off-by: Vineet Gupta --- arch/arc/include/asm/smp.h | 10 +++++++++- arch/arc/kernel/setup.c | 13 +------------ arch/arc/kernel/smp.c | 2 +- arch/arc/plat-arcfpga/Kconfig | 2 +- arch/arc/plat-arcfpga/platform.c | 2 +- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/arc/include/asm/smp.h b/arch/arc/include/asm/smp.h index 5d06eee43ea9..3845b9e94f69 100644 --- a/arch/arc/include/asm/smp.h +++ b/arch/arc/include/asm/smp.h @@ -59,7 +59,15 @@ struct plat_smp_ops { /* TBD: stop exporting it for direct population by platform */ extern struct plat_smp_ops plat_smp_ops; -#endif /* CONFIG_SMP */ +#else /* CONFIG_SMP */ + +static inline void smp_init_cpus(void) {} +static inline const char *arc_platform_smp_cpuinfo(void) +{ + return ""; +} + +#endif /* !CONFIG_SMP */ /* * ARC700 doesn't support atomic Read-Modify-Write ops. diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 119dddb752b2..da61f2205dc5 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -24,6 +24,7 @@ #include #include #include +#include #define FIX_PTR(x) __asm__ __volatile__(";" : "+r"(x)) @@ -306,10 +307,7 @@ void setup_processor(void) arc_chk_ccms(); printk(arc_extn_mumbojumbo(cpu_id, str, sizeof(str))); - -#ifdef CONFIG_SMP printk(arc_platform_smp_cpuinfo()); -#endif arc_chk_fpu(); } @@ -360,11 +358,7 @@ void __init setup_arch(char **cmdline_p) machine_desc->init_early(); setup_processor(); - -#ifdef CONFIG_SMP smp_init_cpus(); -#endif - setup_arch_memory(); /* copy flat DT out of .init and then unflatten it */ @@ -424,14 +418,9 @@ static int show_cpuinfo(struct seq_file *m, void *v) (loops_per_jiffy / (5000 / HZ)) % 100); seq_printf(m, arc_mmu_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, arc_cache_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, arc_extn_mumbojumbo(cpu_id, str, PAGE_SIZE)); - -#ifdef CONFIG_SMP seq_printf(m, arc_platform_smp_cpuinfo()); -#endif free_page((unsigned long)str); done: diff --git a/arch/arc/kernel/smp.c b/arch/arc/kernel/smp.c index dcd317c47d09..d01df0c517a2 100644 --- a/arch/arc/kernel/smp.c +++ b/arch/arc/kernel/smp.c @@ -101,7 +101,7 @@ void __weak arc_platform_smp_wait_to_boot(int cpu) const char *arc_platform_smp_cpuinfo(void) { - return plat_smp_ops.info; + return plat_smp_ops.info ? : ""; } /* diff --git a/arch/arc/plat-arcfpga/Kconfig b/arch/arc/plat-arcfpga/Kconfig index b9f34cf55acf..4965f9f4ffdc 100644 --- a/arch/arc/plat-arcfpga/Kconfig +++ b/arch/arc/plat-arcfpga/Kconfig @@ -8,7 +8,7 @@ menuconfig ARC_PLAT_FPGA_LEGACY bool "\"Legacy\" ARC FPGA dev Boards" - select ISS_SMP_EXTN if SMP + select ARC_HAS_COH_CACHES if SMP help Support for ARC development boards, provided by Synopsys. These are based on FPGA or ISS. e.g. diff --git a/arch/arc/plat-arcfpga/platform.c b/arch/arc/plat-arcfpga/platform.c index 1038949a99a1..6abc341e276d 100644 --- a/arch/arc/plat-arcfpga/platform.c +++ b/arch/arc/plat-arcfpga/platform.c @@ -71,7 +71,7 @@ MACHINE_START(ML509, "ml509") .dt_compat = ml509_compat, .init_early = plat_fpga_early_init, .init_machine = plat_fpga_populate_dev, -#ifdef CONFIG_SMP +#ifdef CONFIG_ISS_SMP_EXTN .init_smp = iss_model_init_smp, #endif MACHINE_END -- cgit v1.2.3 From 4cc72346f05ef549403d997d66fd517109e59d24 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sun, 28 Sep 2014 01:57:14 -0700 Subject: led: gpio: Sort include headers alphabetically If the inlcude headers aren't sorted alphabetically, then the logical choice is to append new ones, however that creates a lot of potential for conflicts or duplicates because every change will then add new includes in the same location. Signed-off-by: Xiubo Li Signed-off-by: Bryan Wu --- drivers/leds/leds-gpio-register.c | 2 +- drivers/leds/leds-gpio.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/leds/leds-gpio-register.c b/drivers/leds/leds-gpio-register.c index 1c4ed5510f35..fbd89344bec4 100644 --- a/drivers/leds/leds-gpio-register.c +++ b/drivers/leds/leds-gpio-register.c @@ -7,9 +7,9 @@ * Free Software Foundation. */ #include +#include #include #include -#include /** * gpio_led_register_device - register a gpio-led device diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 57ff20fecf57..1b1e6176982d 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -10,17 +10,17 @@ * published by the Free Software Foundation. * */ -#include -#include +#include #include +#include #include +#include #include -#include #include +#include +#include #include #include -#include -#include struct gpio_led_data { struct led_classdev cdev; -- cgit v1.2.3 From a823e76138466225d0a9f45520c5654132939a01 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sun, 28 Sep 2014 01:57:15 -0700 Subject: led: gpio: Fix possible ZERO_SIZE_PTR pointer dereferencing error. Since we cannot make sure the 'pdata->num_leds' will always be none zero here, and then if it equals to zero, the kmemdup() will return ZERO_SIZE_PTR, which equals to ((void *)16). So this patch fix this with just doing the zero check before calling kmemdup(). Signed-off-by: Xiubo Li Signed-off-by: Bryan Wu --- drivers/leds/leds-gpio-register.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/leds/leds-gpio-register.c b/drivers/leds/leds-gpio-register.c index fbd89344bec4..75717ba68ae0 100644 --- a/drivers/leds/leds-gpio-register.c +++ b/drivers/leds/leds-gpio-register.c @@ -28,6 +28,9 @@ struct platform_device *__init gpio_led_register_device( struct platform_device *ret; struct gpio_led_platform_data _pdata = *pdata; + if (!pdata->num_leds) + return ERR_PTR(-EINVAL); + _pdata.leds = kmemdup(pdata->leds, pdata->num_leds * sizeof(*pdata->leds), GFP_KERNEL); if (!_pdata.leds) -- cgit v1.2.3 From a4c84e6aafda0ddd8cb004c464cd11e47e211049 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Sun, 28 Sep 2014 01:57:16 -0700 Subject: leds: gpio: cleanup the leds-gpio driver Remove stray blank line and space. Signed-off-by: Xiubo Li Signed-off-by: Bryan Wu --- drivers/leds/leds-gpio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 1b1e6176982d..b4518c8751c8 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -36,7 +36,7 @@ struct gpio_led_data { static void gpio_led_work(struct work_struct *work) { - struct gpio_led_data *led_dat = + struct gpio_led_data *led_dat = container_of(work, struct gpio_led_data, work); if (led_dat->blinking) { @@ -235,14 +235,12 @@ static struct gpio_leds_priv *gpio_leds_create_of(struct platform_device *pdev) } #endif /* CONFIG_OF_GPIO */ - static int gpio_led_probe(struct platform_device *pdev) { struct gpio_led_platform_data *pdata = dev_get_platdata(&pdev->dev); struct gpio_leds_priv *priv; int i, ret = 0; - if (pdata && pdata->num_leds) { priv = devm_kzalloc(&pdev->dev, sizeof_gpio_leds_priv(pdata->num_leds), -- cgit v1.2.3 From cc83881f2c57caaf4b14adaffa65595640a59661 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 30 Jun 2014 16:39:43 -0700 Subject: target: core_tpg_post_dellun can return void Nothing in it can raise an error. Reviewed-by: Christoph Hellwig Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_internal.h | 2 +- drivers/target/target_core_tpg.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index de9cab708f45..463fddcd0ef6 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -83,7 +83,7 @@ struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u32); int core_tpg_add_lun(struct se_portal_group *, struct se_lun *, u32, struct se_device *); struct se_lun *core_tpg_pre_dellun(struct se_portal_group *, u32 unpacked_lun); -int core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); +void core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); /* target_core_transport.c */ extern struct kmem_cache *se_tmr_req_cache; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index fddfae61222f..b596ab509197 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -867,7 +867,7 @@ struct se_lun *core_tpg_pre_dellun( return lun; } -int core_tpg_post_dellun( +void core_tpg_post_dellun( struct se_portal_group *tpg, struct se_lun *lun) { @@ -881,6 +881,4 @@ int core_tpg_post_dellun( spin_unlock(&tpg->tpg_lun_lock); percpu_ref_exit(&lun->lun_ref); - - return 0; } -- cgit v1.2.3 From cd9d7cbaec8b622eee4edcd8bf481c4047f74915 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 30 Jun 2014 16:39:44 -0700 Subject: target: Change core_dev_del_lun to take a se_lun instead of unpacked_lun Remove core_tpg_pre_dellun entirely, since we don't need to get/check a pointer we already have. Nothing else can return an error, so core_dev_del_lun can return void. Rename core_tpg_post_dellun to remove_lun - a clearer name, now that pre_dellun is gone. Reviewed-by: Christoph Hellwig Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_device.c | 18 ++++---------- drivers/target/target_core_fabric_configfs.c | 2 +- drivers/target/target_core_internal.h | 5 ++-- drivers/target/target_core_tpg.c | 36 +++------------------------- 4 files changed, 11 insertions(+), 50 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index 98da90167159..e784284cbc2e 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -1250,24 +1250,16 @@ struct se_lun *core_dev_add_lun( * * */ -int core_dev_del_lun( +void core_dev_del_lun( struct se_portal_group *tpg, - u32 unpacked_lun) + struct se_lun *lun) { - struct se_lun *lun; - - lun = core_tpg_pre_dellun(tpg, unpacked_lun); - if (IS_ERR(lun)) - return PTR_ERR(lun); - - core_tpg_post_dellun(tpg, lun); - - pr_debug("%s_TPG[%u]_LUN[%u] - Deactivated %s Logical Unit from" + pr_debug("%s_TPG[%u]_LUN[%u] - Deactivating %s Logical Unit from" " device object\n", tpg->se_tpg_tfo->get_fabric_name(), - tpg->se_tpg_tfo->tpg_get_tag(tpg), unpacked_lun, + tpg->se_tpg_tfo->tpg_get_tag(tpg), lun->unpacked_lun, tpg->se_tpg_tfo->get_fabric_name()); - return 0; + core_tpg_remove_lun(tpg, lun); } struct se_lun *core_get_lun_from_tpg(struct se_portal_group *tpg, u32 unpacked_lun) diff --git a/drivers/target/target_core_fabric_configfs.c b/drivers/target/target_core_fabric_configfs.c index dc6c781732ee..0c3f90130b7d 100644 --- a/drivers/target/target_core_fabric_configfs.c +++ b/drivers/target/target_core_fabric_configfs.c @@ -822,7 +822,7 @@ static int target_fabric_port_unlink( tf->tf_ops.fabric_pre_unlink(se_tpg, lun); } - core_dev_del_lun(se_tpg, lun->unpacked_lun); + core_dev_del_lun(se_tpg, lun); return 0; } diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 463fddcd0ef6..42ef4ab70585 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -46,7 +46,7 @@ int se_dev_set_fabric_max_sectors(struct se_device *, u32); int se_dev_set_optimal_sectors(struct se_device *, u32); int se_dev_set_block_size(struct se_device *, u32); struct se_lun *core_dev_add_lun(struct se_portal_group *, struct se_device *, u32); -int core_dev_del_lun(struct se_portal_group *, u32); +void core_dev_del_lun(struct se_portal_group *, struct se_lun *); struct se_lun *core_get_lun_from_tpg(struct se_portal_group *, u32); struct se_lun_acl *core_dev_init_initiator_node_lun_acl(struct se_portal_group *, struct se_node_acl *, u32, int *); @@ -82,8 +82,7 @@ void core_tpg_wait_for_nacl_pr_ref(struct se_node_acl *); struct se_lun *core_tpg_alloc_lun(struct se_portal_group *, u32); int core_tpg_add_lun(struct se_portal_group *, struct se_lun *, u32, struct se_device *); -struct se_lun *core_tpg_pre_dellun(struct se_portal_group *, u32 unpacked_lun); -void core_tpg_post_dellun(struct se_portal_group *, struct se_lun *); +void core_tpg_remove_lun(struct se_portal_group *, struct se_lun *); /* target_core_transport.c */ extern struct kmem_cache *se_tmr_req_cache; diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index b596ab509197..259604188a90 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -335,7 +335,7 @@ void core_tpg_clear_object_luns(struct se_portal_group *tpg) continue; spin_unlock(&tpg->tpg_lun_lock); - core_dev_del_lun(tpg, lun->unpacked_lun); + core_dev_del_lun(tpg, lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); @@ -667,7 +667,7 @@ static void core_tpg_release_virtual_lun0(struct se_portal_group *se_tpg) { struct se_lun *lun = &se_tpg->tpg_virt_lun0; - core_tpg_post_dellun(se_tpg, lun); + core_tpg_remove_lun(se_tpg, lun); } int core_tpg_register( @@ -837,37 +837,7 @@ int core_tpg_add_lun( return 0; } -struct se_lun *core_tpg_pre_dellun( - struct se_portal_group *tpg, - u32 unpacked_lun) -{ - struct se_lun *lun; - - if (unpacked_lun > (TRANSPORT_MAX_LUNS_PER_TPG-1)) { - pr_err("%s LUN: %u exceeds TRANSPORT_MAX_LUNS_PER_TPG" - "-1: %u for Target Portal Group: %u\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - TRANSPORT_MAX_LUNS_PER_TPG-1, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - return ERR_PTR(-EOVERFLOW); - } - - spin_lock(&tpg->tpg_lun_lock); - lun = tpg->tpg_lun_list[unpacked_lun]; - if (lun->lun_status != TRANSPORT_LUN_STATUS_ACTIVE) { - pr_err("%s Logical Unit Number: %u is not active on" - " Target Portal Group: %u, ignoring request.\n", - tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, - tpg->se_tpg_tfo->tpg_get_tag(tpg)); - spin_unlock(&tpg->tpg_lun_lock); - return ERR_PTR(-ENODEV); - } - spin_unlock(&tpg->tpg_lun_lock); - - return lun; -} - -void core_tpg_post_dellun( +void core_tpg_remove_lun( struct se_portal_group *tpg, struct se_lun *lun) { -- cgit v1.2.3 From 9c7d6154bc4b9dfefd580490cdca5f7c72321464 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 30 Jun 2014 16:39:46 -0700 Subject: target: Remove core_tpg_release_virtual_lun0 function Simple and just called from one place. Reviewed-by: Christoph Hellwig Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_tpg.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 259604188a90..6ec58736d1a4 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -663,13 +663,6 @@ static int core_tpg_setup_virtual_lun0(struct se_portal_group *se_tpg) return 0; } -static void core_tpg_release_virtual_lun0(struct se_portal_group *se_tpg) -{ - struct se_lun *lun = &se_tpg->tpg_virt_lun0; - - core_tpg_remove_lun(se_tpg, lun); -} - int core_tpg_register( struct target_core_fabric_ops *tfo, struct se_wwn *se_wwn, @@ -773,7 +766,7 @@ int core_tpg_deregister(struct se_portal_group *se_tpg) spin_unlock_irq(&se_tpg->acl_node_lock); if (se_tpg->se_tpg_type == TRANSPORT_TPG_TYPE_NORMAL) - core_tpg_release_virtual_lun0(se_tpg); + core_tpg_remove_lun(se_tpg, &se_tpg->tpg_virt_lun0); se_tpg->se_tpg_fabric_ptr = NULL; array_free(se_tpg->tpg_lun_list, TRANSPORT_MAX_LUNS_PER_TPG); -- cgit v1.2.3 From 8f83269048628d7b139dacbfc6cc97befcbdd2e9 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:10 -0400 Subject: target: simplify core_tmr_release_req() And while at it, do minimal coding style fixes in the area. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_tmr.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index f7cd95e8111a..83de7aec4aac 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -64,21 +64,17 @@ int core_tmr_alloc_req( } EXPORT_SYMBOL(core_tmr_alloc_req); -void core_tmr_release_req( - struct se_tmr_req *tmr) +void core_tmr_release_req(struct se_tmr_req *tmr) { struct se_device *dev = tmr->tmr_dev; unsigned long flags; - if (!dev) { - kfree(tmr); - return; + if (dev) { + spin_lock_irqsave(&dev->se_tmr_lock, flags); + list_del(&tmr->tmr_list); + spin_unlock_irqrestore(&dev->se_tmr_lock, flags); } - spin_lock_irqsave(&dev->se_tmr_lock, flags); - list_del(&tmr->tmr_list); - spin_unlock_irqrestore(&dev->se_tmr_lock, flags); - kfree(tmr); } @@ -90,9 +86,8 @@ static void core_tmr_handle_tas_abort( bool remove = true; /* * TASK ABORTED status (TAS) bit support - */ - if ((tmr_nacl && - (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { + */ + if ((tmr_nacl && (tmr_nacl != cmd->se_sess->se_node_acl)) && tas) { remove = false; transport_send_task_abort(cmd); } -- cgit v1.2.3 From 74ed7e62289dc6d388996d7c8f89c2e7e95b9657 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:11 -0400 Subject: target: remove some smp_mb__after_atomic()s atomic_inc_return() already does an implicit memory barrier and the second case was moved from an atomic to a plain flag operation. If a barrier were needed in the second case, it would have to be smp_mb(), not a variant optimized away for x86 and other architectures. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7fa62fc93e0b..0b43761ed85f 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1166,7 +1166,6 @@ transport_check_alloc_task_attr(struct se_cmd *cmd) * Dormant to Active status. */ cmd->se_ordered_id = atomic_inc_return(&dev->dev_ordered_id); - smp_mb__after_atomic(); pr_debug("Allocated se_ordered_id: %u for Task Attr: 0x%02x on %s\n", cmd->se_ordered_id, cmd->sam_task_attr, dev->transport->name); @@ -2896,7 +2895,6 @@ void transport_send_task_abort(struct se_cmd *cmd) if (cmd->se_tfo->write_pending_status(cmd) != 0) { cmd->transport_state |= CMD_T_ABORTED; cmd->se_cmd_flags |= SCF_SEND_DELAYED_TAS; - smp_mb__after_atomic(); return; } } -- cgit v1.2.3 From 33940d09937276cd3c81f2874faf43e37c2db0e2 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:12 -0400 Subject: target: encapsulate smp_mb__after_atomic() The target code has a rather generous helping of smp_mb__after_atomic() throughout the code base. Most atomic operations were followed by one and none were preceded by smp_mb__before_atomic(), nor accompanied by a comment explaining the need for a barrier. Instead of trying to prove for every case whether or not it is needed, this patch introduces atomic_inc_mb() and atomic_dec_mb(), which explicitly include the memory barriers before and after the atomic operation. For now they are defined in a target header, although they could be of general use. Most of the existing atomic/mb combinations were replaced by the new helpers. In a few cases the atomic was sandwiched in spin_lock/spin_unlock and I simply removed the barrier. I suspect that in most cases the correct conversion would have been to drop the barrier. I also suspect that a few cases exist where a) the barrier was necessary and b) a second barrier before the atomic would have been necessary and got added by this patch. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/loopback/tcm_loop.c | 6 +-- drivers/target/target_core_alua.c | 33 ++++--------- drivers/target/target_core_device.c | 9 ++-- drivers/target/target_core_pr.c | 90 ++++++++++++---------------------- drivers/target/target_core_transport.c | 18 +++---- drivers/target/target_core_ua.c | 15 ++---- include/target/target_core_base.h | 14 ++++++ 7 files changed, 70 insertions(+), 115 deletions(-) diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index 340de9d92b15..a7f6dc646045 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -960,8 +960,7 @@ static int tcm_loop_port_link( struct tcm_loop_tpg, tl_se_tpg); struct tcm_loop_hba *tl_hba = tl_tpg->tl_hba; - atomic_inc(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tl_tpg->tl_tpg_port_count); /* * Add Linux/SCSI struct scsi_device by HCTL */ @@ -995,8 +994,7 @@ static void tcm_loop_port_unlink( scsi_remove_device(sd); scsi_device_put(sd); - atomic_dec(&tl_tpg->tl_tpg_port_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tl_tpg->tl_tpg_port_count); pr_debug("TCM_Loop_ConfigFS: Port Unlink Successful\n"); } diff --git a/drivers/target/target_core_alua.c b/drivers/target/target_core_alua.c index fbc5ebb5f761..fb87780929d2 100644 --- a/drivers/target/target_core_alua.c +++ b/drivers/target/target_core_alua.c @@ -392,8 +392,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) if (tg_pt_id != tg_pt_gp->tg_pt_gp_id) continue; - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -403,8 +402,7 @@ target_emulate_set_target_port_groups(struct se_cmd *cmd) found = true; spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); @@ -998,8 +996,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) * every I_T nexus other than the I_T nexus on which the SET * TARGET PORT GROUPS command */ - atomic_inc(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&mem->tg_pt_gp_mem_ref_cnt); spin_unlock(&tg_pt_gp->tg_pt_gp_lock); spin_lock_bh(&port->sep_alua_lock); @@ -1028,8 +1025,7 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) spin_unlock_bh(&port->sep_alua_lock); spin_lock(&tg_pt_gp->tg_pt_gp_lock); - atomic_dec(&mem->tg_pt_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&mem->tg_pt_gp_mem_ref_cnt); } spin_unlock(&tg_pt_gp->tg_pt_gp_lock); /* @@ -1063,7 +1059,6 @@ static void core_alua_do_transition_tg_pt_work(struct work_struct *work) core_alua_dump_state(tg_pt_gp->tg_pt_gp_alua_pending_state)); spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (tg_pt_gp->tg_pt_gp_transition_complete) @@ -1125,7 +1120,6 @@ static int core_alua_do_transition_tg_pt( */ spin_lock(&dev->t10_alua.tg_pt_gps_lock); atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); if (!explicit && tg_pt_gp->tg_pt_gp_implicit_trans_secs) { @@ -1168,7 +1162,6 @@ int core_alua_do_port_transition( spin_lock(&local_lu_gp_mem->lu_gp_mem_lock); lu_gp = local_lu_gp_mem->lu_gp; atomic_inc(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); spin_unlock(&local_lu_gp_mem->lu_gp_mem_lock); /* * For storage objects that are members of the 'default_lu_gp', @@ -1184,8 +1177,7 @@ int core_alua_do_port_transition( l_tg_pt_gp->tg_pt_gp_alua_nacl = l_nacl; rc = core_alua_do_transition_tg_pt(l_tg_pt_gp, new_state, explicit); - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } /* @@ -1198,8 +1190,7 @@ int core_alua_do_port_transition( lu_gp_mem_list) { dev = lu_gp_mem->lu_gp_mem_dev; - atomic_inc(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); spin_unlock(&lu_gp->lu_gp_lock); spin_lock(&dev->t10_alua.tg_pt_gps_lock); @@ -1227,8 +1218,7 @@ int core_alua_do_port_transition( tg_pt_gp->tg_pt_gp_alua_port = NULL; tg_pt_gp->tg_pt_gp_alua_nacl = NULL; } - atomic_inc(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); spin_unlock(&dev->t10_alua.tg_pt_gps_lock); /* * core_alua_do_transition_tg_pt() will always return @@ -1238,16 +1228,14 @@ int core_alua_do_port_transition( new_state, explicit); spin_lock(&dev->t10_alua.tg_pt_gps_lock); - atomic_dec(&tg_pt_gp->tg_pt_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&tg_pt_gp->tg_pt_gp_ref_cnt); if (rc) break; } spin_unlock(&dev->t10_alua.tg_pt_gps_lock); spin_lock(&lu_gp->lu_gp_lock); - atomic_dec(&lu_gp_mem->lu_gp_mem_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp_mem->lu_gp_mem_ref_cnt); } spin_unlock(&lu_gp->lu_gp_lock); @@ -1260,8 +1248,7 @@ int core_alua_do_port_transition( core_alua_dump_state(new_state)); } - atomic_dec(&lu_gp->lu_gp_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&lu_gp->lu_gp_ref_cnt); return rc; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index e784284cbc2e..f5057a2f4ed1 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -224,8 +224,7 @@ struct se_dev_entry *core_get_se_deve_from_rtpi( if (port->sep_rtpi != rtpi) continue; - atomic_inc(&deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->pr_ref_count); spin_unlock_irq(&nacl->device_list_lock); return deve; @@ -1388,8 +1387,7 @@ int core_dev_add_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_add_tail(&lacl->lacl_list, &lun->lun_acl_list); - atomic_inc(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_inc_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); pr_debug("%s_TPG[%hu]_LUN[%u->%u] - Added %s ACL for " @@ -1422,8 +1420,7 @@ int core_dev_del_initiator_node_lun_acl( spin_lock(&lun->lun_acl_lock); list_del(&lacl->lacl_list); - atomic_dec(&lun->lun_acl_count); - smp_mb__after_atomic(); + atomic_dec_mb(&lun->lun_acl_count); spin_unlock(&lun->lun_acl_lock); core_disable_device_list_for_node(lun, NULL, lacl->mapped_lun, diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 281d52e3fe99..48a801045176 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -674,8 +674,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( */ spin_lock(&dev->se_port_lock); list_for_each_entry_safe(port, port_tmp, &dev->dev_sep_list, sep_list) { - atomic_inc(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_inc_mb(&port->sep_tg_pt_ref_cnt); spin_unlock(&dev->se_port_lock); spin_lock_bh(&port->sep_alua_lock); @@ -709,8 +708,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (strcmp(nacl->initiatorname, nacl_tmp->initiatorname)) continue; - atomic_inc(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve_tmp->pr_ref_count); spin_unlock_bh(&port->sep_alua_lock); /* * Grab a configfs group dependency that is released @@ -722,10 +720,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( if (ret < 0) { pr_err("core_scsi3_lunacl_depend" "_item() failed\n"); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); goto out; } /* @@ -739,10 +735,8 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( nacl_tmp, deve_tmp, NULL, sa_res_key, all_tg_pt, aptpl); if (!pr_reg_atp) { - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); - atomic_dec(&deve_tmp->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); + atomic_dec_mb(&deve_tmp->pr_ref_count); core_scsi3_lunacl_undepend_item(deve_tmp); goto out; } @@ -754,8 +748,7 @@ static struct t10_pr_registration *__core_scsi3_alloc_registration( spin_unlock_bh(&port->sep_alua_lock); spin_lock(&dev->se_port_lock); - atomic_dec(&port->sep_tg_pt_ref_cnt); - smp_mb__after_atomic(); + atomic_dec_mb(&port->sep_tg_pt_ref_cnt); } spin_unlock(&dev->se_port_lock); @@ -1109,8 +1102,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (dev->dev_attrib.enforce_pr_isids) continue; } - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1124,8 +1116,7 @@ static struct t10_pr_registration *__core_scsi3_locate_pr_reg( if (strcmp(isid, pr_reg->pr_reg_isid)) continue; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); return pr_reg; } @@ -1154,8 +1145,7 @@ static struct t10_pr_registration *core_scsi3_locate_pr_reg( static void core_scsi3_put_pr_reg(struct t10_pr_registration *pr_reg) { - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); } static int core_scsi3_check_implicit_release( @@ -1348,8 +1338,7 @@ static void core_scsi3_tpg_undepend_item(struct se_portal_group *tpg) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &tpg->tpg_group.cg_item); - atomic_dec(&tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tpg->tpg_pr_ref_count); } static int core_scsi3_nodeacl_depend_item(struct se_node_acl *nacl) @@ -1368,16 +1357,14 @@ static void core_scsi3_nodeacl_undepend_item(struct se_node_acl *nacl) struct se_portal_group *tpg = nacl->se_tpg; if (nacl->dynamic_node_acl) { - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); return; } configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &nacl->acl_group.cg_item); - atomic_dec(&nacl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&nacl->acl_pr_ref_count); } static int core_scsi3_lunacl_depend_item(struct se_dev_entry *se_deve) @@ -1407,8 +1394,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) * For nacl->dynamic_node_acl=1 */ if (!lun_acl) { - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); return; } nacl = lun_acl->se_lun_nacl; @@ -1417,8 +1403,7 @@ static void core_scsi3_lunacl_undepend_item(struct se_dev_entry *se_deve) configfs_undepend_item(tpg->se_tpg_tfo->tf_subsys, &lun_acl->se_lun_group.cg_item); - atomic_dec(&se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&se_deve->pr_ref_count); } static sense_reason_t @@ -1551,15 +1536,13 @@ core_scsi3_decode_spec_i_port( if (!i_str) continue; - atomic_inc(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&tmp_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(tmp_tpg)) { pr_err(" core_scsi3_tpg_depend_item()" " for tmp_tpg\n"); - atomic_dec(&tmp_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&tmp_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; } @@ -1571,10 +1554,8 @@ core_scsi3_decode_spec_i_port( spin_lock_irq(&tmp_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl( tmp_tpg, i_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&tmp_tpg->acl_node_lock); if (!dest_node_acl) { @@ -1586,8 +1567,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("configfs_depend_item() failed" " for dest_node_acl->acl_group\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); core_scsi3_tpg_undepend_item(tmp_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_unmap; @@ -1646,8 +1626,7 @@ core_scsi3_decode_spec_i_port( if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item()" " failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); core_scsi3_nodeacl_undepend_item(dest_node_acl); core_scsi3_tpg_undepend_item(dest_tpg); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; @@ -3167,15 +3146,13 @@ core_scsi3_emulate_pro_register_and_move(struct se_cmd *cmd, u64 res_key, if (!dest_tf_ops) continue; - atomic_inc(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dest_se_tpg->tpg_pr_ref_count); spin_unlock(&dev->se_port_lock); if (core_scsi3_tpg_depend_item(dest_se_tpg)) { pr_err("core_scsi3_tpg_depend_item() failed" " for dest_se_tpg\n"); - atomic_dec(&dest_se_tpg->tpg_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_tpg->tpg_pr_ref_count); ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out_put_pr_reg; } @@ -3271,10 +3248,8 @@ after_iport_check: spin_lock_irq(&dest_se_tpg->acl_node_lock); dest_node_acl = __core_tpg_get_initiator_node_acl(dest_se_tpg, initiator_str); - if (dest_node_acl) { - atomic_inc(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); - } + if (dest_node_acl) + atomic_inc_mb(&dest_node_acl->acl_pr_ref_count); spin_unlock_irq(&dest_se_tpg->acl_node_lock); if (!dest_node_acl) { @@ -3288,8 +3263,7 @@ after_iport_check: if (core_scsi3_nodeacl_depend_item(dest_node_acl)) { pr_err("core_scsi3_nodeacl_depend_item() for" " dest_node_acl\n"); - atomic_dec(&dest_node_acl->acl_pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_node_acl->acl_pr_ref_count); dest_node_acl = NULL; ret = TCM_INVALID_PARAMETER_LIST; goto out; @@ -3313,8 +3287,7 @@ after_iport_check: if (core_scsi3_lunacl_depend_item(dest_se_deve)) { pr_err("core_scsi3_lunacl_depend_item() failed\n"); - atomic_dec(&dest_se_deve->pr_ref_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dest_se_deve->pr_ref_count); dest_se_deve = NULL; ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; goto out; @@ -3879,8 +3852,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_tpg = pr_reg->pr_reg_nacl->se_tpg; add_desc_len = 0; - atomic_inc(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_inc_mb(&pr_reg->pr_res_holders); spin_unlock(&pr_tmpl->registration_lock); /* * Determine expected length of $FABRIC_MOD specific @@ -3893,8 +3865,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) pr_warn("SPC-3 PRIN READ_FULL_STATUS ran" " out of buffer: %d\n", cmd->data_length); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); break; } /* @@ -3955,8 +3926,7 @@ core_scsi3_pri_read_full_status(struct se_cmd *cmd) se_nacl, pr_reg, &format_code, &buf[off+4]); spin_lock(&pr_tmpl->registration_lock); - atomic_dec(&pr_reg->pr_res_holders); - smp_mb__after_atomic(); + atomic_dec_mb(&pr_reg->pr_res_holders); /* * Set the ADDITIONAL DESCRIPTOR LENGTH */ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 0b43761ed85f..115632ee3ec8 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -752,8 +752,7 @@ void target_qf_do_work(struct work_struct *work) list_for_each_entry_safe(cmd, cmd_tmp, &qf_cmd_list, se_qf_node) { list_del(&cmd->se_qf_node); - atomic_dec(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_qf_count); pr_debug("Processing %s cmd: %p QUEUE_FULL in work queue" " context: %s\n", cmd->se_tfo->get_fabric_name(), cmd, @@ -1721,8 +1720,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) cmd->t_task_cdb[0], cmd->se_ordered_id); return false; case MSG_ORDERED_TAG: - atomic_inc(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_ordered_sync); pr_debug("Added ORDERED for CDB: 0x%02x to ordered list, " " se_ordered_id: %u\n", @@ -1739,8 +1737,7 @@ static bool target_handle_task_attr(struct se_cmd *cmd) /* * For SIMPLE and UNTAGGED Task Attribute commands */ - atomic_inc(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->simple_cmds); break; } @@ -1844,8 +1841,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) return; if (cmd->sam_task_attr == MSG_SIMPLE_TAG) { - atomic_dec(&dev->simple_cmds); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->simple_cmds); dev->dev_cur_ordered_id++; pr_debug("Incremented dev->dev_cur_ordered_id: %u for" " SIMPLE: %u\n", dev->dev_cur_ordered_id, @@ -1856,8 +1852,7 @@ static void transport_complete_task_attr(struct se_cmd *cmd) " HEAD_OF_QUEUE: %u\n", dev->dev_cur_ordered_id, cmd->se_ordered_id); } else if (cmd->sam_task_attr == MSG_ORDERED_TAG) { - atomic_dec(&dev->dev_ordered_sync); - smp_mb__after_atomic(); + atomic_dec_mb(&dev->dev_ordered_sync); dev->dev_cur_ordered_id++; pr_debug("Incremented dev_cur_ordered_id: %u for ORDERED:" @@ -1915,8 +1910,7 @@ static void transport_handle_queue_full( { spin_lock_irq(&dev->qf_cmd_lock); list_add_tail(&cmd->se_qf_node, &cmd->se_dev->qf_cmd_list); - atomic_inc(&dev->dev_qf_count); - smp_mb__after_atomic(); + atomic_inc_mb(&dev->dev_qf_count); spin_unlock_irq(&cmd->se_dev->qf_cmd_lock); schedule_work(&cmd->se_dev->qf_work_queue); diff --git a/drivers/target/target_core_ua.c b/drivers/target/target_core_ua.c index 101858e245b3..1738b1646988 100644 --- a/drivers/target/target_core_ua.c +++ b/drivers/target/target_core_ua.c @@ -161,8 +161,7 @@ int core_scsi3_ua_allocate( spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } list_add_tail(&ua->ua_nacl_list, &deve->ua_list); @@ -174,8 +173,7 @@ int core_scsi3_ua_allocate( nacl->se_tpg->se_tpg_tfo->get_fabric_name(), unpacked_lun, asc, ascq); - atomic_inc(&deve->ua_count); - smp_mb__after_atomic(); + atomic_inc_mb(&deve->ua_count); return 0; } @@ -189,8 +187,7 @@ void core_scsi3_ua_release_all( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); } @@ -250,8 +247,7 @@ void core_scsi3_ua_for_check_condition( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); @@ -309,8 +305,7 @@ int core_scsi3_ua_clear_for_request_sense( list_del(&ua->ua_nacl_list); kmem_cache_free(se_ua_cache, ua); - atomic_dec(&deve->ua_count); - smp_mb__after_atomic(); + atomic_dec_mb(&deve->ua_count); } spin_unlock(&deve->ua_lock); spin_unlock_irq(&nacl->device_list_lock); diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index 9ec9864ecf38..b106240d8385 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -903,4 +903,18 @@ struct se_wwn { struct config_group fabric_stat_group; }; +static inline void atomic_inc_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_inc(v); + smp_mb__after_atomic(); +} + +static inline void atomic_dec_mb(atomic_t *v) +{ + smp_mb__before_atomic(); + atomic_dec(v); + smp_mb__after_atomic(); +} + #endif /* TARGET_CORE_BASE_H */ -- cgit v1.2.3 From f81ccb489a7a641c1bed41b49cf8d72c199c68d5 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:13 -0400 Subject: target: simplify core_tmr_abort_task list_for_each_entry_safe is necessary if list objects are deleted from the list while traversing it. Not the case here, so we can use the base list_for_each_entry variant. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_tmr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/target/target_core_tmr.c b/drivers/target/target_core_tmr.c index 83de7aec4aac..fa5e157db47b 100644 --- a/drivers/target/target_core_tmr.c +++ b/drivers/target/target_core_tmr.c @@ -115,13 +115,12 @@ void core_tmr_abort_task( struct se_tmr_req *tmr, struct se_session *se_sess) { - struct se_cmd *se_cmd, *tmp_cmd; + struct se_cmd *se_cmd; unsigned long flags; int ref_tag; spin_lock_irqsave(&se_sess->sess_cmd_lock, flags); - list_for_each_entry_safe(se_cmd, tmp_cmd, - &se_sess->sess_cmd_list, se_cmd_list) { + list_for_each_entry(se_cmd, &se_sess->sess_cmd_list, se_cmd_list) { if (dev != se_cmd->se_dev) continue; -- cgit v1.2.3 From c57010420654aca179c500f61e86315a337244ca Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:14 -0400 Subject: qla_target: remove unused parameter Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index e632e14180cf..577058e4537f 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -401,7 +401,7 @@ static int qlt_reset(struct scsi_qla_host *vha, void *iocb, int mcmd) #if 0 /* FIXME: Re-enable Global event handling.. */ /* Global event */ atomic_inc(&ha->tgt.qla_tgt->tgt_global_resets_count); - qlt_clear_tgt_db(ha->tgt.qla_tgt, 1); + qlt_clear_tgt_db(ha->tgt.qla_tgt); if (!list_empty(&ha->tgt.qla_tgt->sess_list)) { sess = list_entry(ha->tgt.qla_tgt->sess_list.next, typeof(*sess), sess_list_entry); @@ -483,7 +483,7 @@ static void qlt_schedule_sess_for_deletion(struct qla_tgt_sess *sess, } /* ha->hardware_lock supposed to be held on entry */ -static void qlt_clear_tgt_db(struct qla_tgt *tgt, bool local_only) +static void qlt_clear_tgt_db(struct qla_tgt *tgt) { struct qla_tgt_sess *sess; @@ -835,7 +835,7 @@ int qlt_stop_phase1(struct qla_tgt *tgt) mutex_lock(&vha->vha_tgt.tgt_mutex); spin_lock_irqsave(&ha->hardware_lock, flags); tgt->tgt_stop = 1; - qlt_clear_tgt_db(tgt, true); + qlt_clear_tgt_db(tgt); spin_unlock_irqrestore(&ha->hardware_lock, flags); mutex_unlock(&vha->vha_tgt.tgt_mutex); mutex_unlock(&qla_tgt_mutex); -- cgit v1.2.3 From 55a9066fffd2f533e7ed434b072469ef09d6c476 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:15 -0400 Subject: qla_target: make some global functions static Also removes the declarations from the header - including two declarations without function definitions or callers. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 10 +++++----- drivers/scsi/qla2xxx/qla_target.h | 10 ---------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index 577058e4537f..c22b1c141fec 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -101,6 +101,7 @@ static void qlt_send_term_exchange(struct scsi_qla_host *ha, struct qla_tgt_cmd *cmd, struct atio_from_isp *atio, int ha_locked); static void qlt_reject_free_srr_imm(struct scsi_qla_host *ha, struct qla_tgt_srr_imm *imm, int ha_lock); +static void qlt_disable_vha(struct scsi_qla_host *vha); /* * Global Variables */ @@ -178,7 +179,7 @@ struct scsi_qla_host *qlt_find_host_by_vp_idx(struct scsi_qla_host *vha, return NULL; } -void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, +static void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *vha, struct atio_from_isp *atio) { ql_dbg(ql_dbg_tgt, vha, 0xe072, @@ -5024,7 +5025,7 @@ void qlt_lport_deregister(struct scsi_qla_host *vha) EXPORT_SYMBOL(qlt_lport_deregister); /* Must be called under HW lock */ -void qlt_set_mode(struct scsi_qla_host *vha) +static void qlt_set_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5045,7 +5046,7 @@ void qlt_set_mode(struct scsi_qla_host *vha) } /* Must be called under HW lock */ -void qlt_clear_mode(struct scsi_qla_host *vha) +static void qlt_clear_mode(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; @@ -5109,8 +5110,7 @@ EXPORT_SYMBOL(qlt_enable_vha); * * Disable Target Mode and reset the adapter */ -void -qlt_disable_vha(struct scsi_qla_host *vha) +static void qlt_disable_vha(struct scsi_qla_host *vha) { struct qla_hw_data *ha = vha->hw; struct qla_tgt *tgt = vha->vha_tgt.qla_tgt; diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index d1d24fb0160a..bc1aee78d934 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -1003,10 +1003,6 @@ struct qla_tgt_srr_ctio { extern struct qla_tgt_data qla_target; -/* - * Internal function prototypes - */ -void qlt_disable_vha(struct scsi_qla_host *); /* * Function prototypes for qla_target.c logic used by qla2xxx LLD code. @@ -1019,8 +1015,6 @@ extern void qlt_lport_deregister(struct scsi_qla_host *); extern void qlt_unreg_sess(struct qla_tgt_sess *); extern void qlt_fc_port_added(struct scsi_qla_host *, fc_port_t *); extern void qlt_fc_port_deleted(struct scsi_qla_host *, fc_port_t *); -extern void qlt_set_mode(struct scsi_qla_host *ha); -extern void qlt_clear_mode(struct scsi_qla_host *ha); extern int __init qlt_init(void); extern void qlt_exit(void); extern void qlt_update_vp_map(struct scsi_qla_host *, int); @@ -1053,13 +1047,9 @@ static inline void qla_reverse_ini_mode(struct scsi_qla_host *ha) /* * Exported symbols from qla_target.c LLD logic used by qla2xxx code.. */ -extern void qlt_24xx_atio_pkt_all_vps(struct scsi_qla_host *, - struct atio_from_isp *); extern void qlt_response_pkt_all_vps(struct scsi_qla_host *, response_t *); extern int qlt_rdy_to_xfer(struct qla_tgt_cmd *); extern int qlt_xmit_response(struct qla_tgt_cmd *, int, uint8_t); -extern int qlt_rdy_to_xfer_dif(struct qla_tgt_cmd *); -extern int qlt_xmit_response_dif(struct qla_tgt_cmd *, int, uint8_t); extern void qlt_xmit_tm_rsp(struct qla_tgt_mgmt_cmd *); extern void qlt_free_mcmd(struct qla_tgt_mgmt_cmd *); extern void qlt_free_cmd(struct qla_tgt_cmd *cmd); -- cgit v1.2.3 From f9b6721a9cef94908467abf7a2cacbd15a7d23cb Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:18 -0400 Subject: qla_target: improve qlt_unmap_sg() Remove the inline attribute. Modern compilers ignore it and the function has grown beyond where inline made sense anyway. Remove the BUG_ON(!cmd->sg_mapped), and instead return if sg_mapped is not set. Every caller is doing this check, so we might as well have it in one place instead of four. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_target.c b/drivers/scsi/qla2xxx/qla_target.c index c22b1c141fec..68c90ad441f4 100644 --- a/drivers/scsi/qla2xxx/qla_target.c +++ b/drivers/scsi/qla2xxx/qla_target.c @@ -1409,12 +1409,13 @@ out_err: return -1; } -static inline void qlt_unmap_sg(struct scsi_qla_host *vha, - struct qla_tgt_cmd *cmd) +static void qlt_unmap_sg(struct scsi_qla_host *vha, struct qla_tgt_cmd *cmd) { struct qla_hw_data *ha = vha->hw; - BUG_ON(!cmd->sg_mapped); + if (!cmd->sg_mapped) + return; + pci_unmap_sg(ha->pdev, cmd->sg, cmd->sg_cnt, cmd->dma_data_direction); cmd->sg_mapped = 0; @@ -2403,8 +2404,7 @@ int qlt_xmit_response(struct qla_tgt_cmd *cmd, int xmit_type, return 0; out_unmap_unlock: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2468,8 +2468,7 @@ int qlt_rdy_to_xfer(struct qla_tgt_cmd *cmd) return res; out_unlock_free_unmap: - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); spin_unlock_irqrestore(&ha->hardware_lock, flags); return res; @@ -2706,8 +2705,7 @@ done: if (!ha_locked && !in_interrupt()) msleep(250); /* just in case */ - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); vha->hw->tgt.tgt_ops->free_cmd(cmd); } return; @@ -2927,8 +2925,7 @@ static void qlt_do_ctio_completion(struct scsi_qla_host *vha, uint32_t handle, se_cmd = &cmd->se_cmd; tfo = se_cmd->se_tfo; - if (cmd->sg_mapped) - qlt_unmap_sg(vha, cmd); + qlt_unmap_sg(vha, cmd); if (unlikely(status != CTIO_SUCCESS)) { switch (status & 0xFFFF) { -- cgit v1.2.3 From db3a99b9921f27fe71ca8c0f218ee810e0e7fb69 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 16 Sep 2014 16:23:19 -0400 Subject: qla_target: rearrange struct qla_tgt_prm On most (non-x86) 64bit platforms this will remove 8 padding bytes from the structure. Signed-off-by: Joern Engel Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/qla_target.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_target.h b/drivers/scsi/qla2xxx/qla_target.h index bc1aee78d934..c93eeab0cfb2 100644 --- a/drivers/scsi/qla2xxx/qla_target.h +++ b/drivers/scsi/qla2xxx/qla_target.h @@ -971,11 +971,11 @@ struct qla_tgt_prm { struct qla_tgt *tgt; void *pkt; struct scatterlist *sg; /* cmd data buffer SG vector */ + unsigned char *sense_buffer; int seg_cnt; int req_cnt; uint16_t rq_result; uint16_t scsi_status; - unsigned char *sense_buffer; int sense_buffer_len; int residual; int add_status_pkt; -- cgit v1.2.3 From 082f58ac4a48d3f5cb4597232cb2ac6823a96f43 Mon Sep 17 00:00:00 2001 From: Quinn Tran Date: Thu, 25 Sep 2014 06:22:28 -0400 Subject: target: Fix queue full status NULL pointer for SCF_TRANSPORT_TASK_SENSE During temporary resource starvation at lower transport layer, command is placed on queue full retry path, which expose this problem. The TCM queue full handling of SCF_TRANSPORT_TASK_SENSE currently sends the same cmd twice to lower layer. The 1st time led to cmd normal free path. The 2nd time cause Null pointer access. This regression bug was originally introduced v3.1-rc code in the following commit: commit e057f53308a5f071556ee80586b99ee755bf07f5 Author: Christoph Hellwig Date: Mon Oct 17 13:56:41 2011 -0400 target: remove the transport_qf_callback se_cmd callback Signed-off-by: Quinn Tran Signed-off-by: Saurav Kashyap Cc: # v3.1+ Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 115632ee3ec8..9700ea125268 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -1871,8 +1871,7 @@ static void transport_complete_qf(struct se_cmd *cmd) if (cmd->se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) { trace_target_cmd_complete(cmd); ret = cmd->se_tfo->queue_status(cmd); - if (ret) - goto out; + goto out; } switch (cmd->data_direction) { -- cgit v1.2.3 From 90a8020278c1598fafd071736a0846b38510309c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:18 -0400 Subject: vfs: fix data corruption when blocksize < pagesize for mmaped data ->page_mkwrite() is used by filesystems to allocate blocks under a page which is becoming writeably mmapped in some process' address space. This allows a filesystem to return a page fault if there is not enough space available, user exceeds quota or similar problem happens, rather than silently discarding data later when writepage is called. However VFS fails to call ->page_mkwrite() in all the cases where filesystems need it when blocksize < pagesize. For example when blocksize = 1024, pagesize = 4096 the following is problematic: ftruncate(fd, 0); pwrite(fd, buf, 1024, 0); map = mmap(NULL, 1024, PROT_WRITE, MAP_SHARED, fd, 0); map[0] = 'a'; ----> page_mkwrite() for index 0 is called ftruncate(fd, 10000); /* or even pwrite(fd, buf, 1, 10000) */ mremap(map, 1024, 10000, 0); map[4095] = 'a'; ----> no page_mkwrite() called At the moment ->page_mkwrite() is called, filesystem can allocate only one block for the page because i_size == 1024. Otherwise it would create blocks beyond i_size which is generally undesirable. But later at ->writepage() time, we also need to store data at offset 4095 but we don't have block allocated for it. This patch introduces a helper function filesystems can use to have ->page_mkwrite() called at all the necessary moments. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/buffer.c | 3 +++ include/linux/mm.h | 1 + mm/truncate.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index 9a6029e0dd71..6dc1475dcb2d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2087,6 +2087,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2106,6 +2107,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock diff --git a/include/linux/mm.h b/include/linux/mm.h index 8981cc882ed2..5005464fe012 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1155,6 +1155,7 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping, extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); diff --git a/mm/truncate.c b/mm/truncate.c index 96d167372d89..261eaf6e5a19 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -20,6 +20,7 @@ #include /* grr. try_to_release_page, do_invalidatepage */ #include +#include #include "internal.h" static void clear_exceptional_entry(struct address_space *mapping, @@ -719,11 +720,67 @@ EXPORT_SYMBOL(truncate_pagecache); */ void truncate_setsize(struct inode *inode, loff_t newsize) { + loff_t oldsize = inode->i_size; + i_size_write(inode, newsize); + if (newsize > oldsize) + pagecache_isize_extended(inode, oldsize, newsize); truncate_pagecache(inode, newsize); } EXPORT_SYMBOL(truncate_setsize); +/** + * pagecache_isize_extended - update pagecache after extension of i_size + * @inode: inode for which i_size was extended + * @from: original inode size + * @to: new inode size + * + * Handle extension of inode size either caused by extending truncate or by + * write starting after current i_size. We mark the page straddling current + * i_size RO so that page_mkwrite() is called on the nearest write access to + * the page. This way filesystem can be sure that page_mkwrite() is called on + * the page before user writes to the page via mmap after the i_size has been + * changed. + * + * The function must be called after i_size is updated so that page fault + * coming after we unlock the page will already see the new i_size. + * The function must be called while we still hold i_mutex - this not only + * makes sure i_size is stable but also that userspace cannot observe new + * i_size value before we are prepared to store mmap writes at new inode size. + */ +void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to) +{ + int bsize = 1 << inode->i_blkbits; + loff_t rounded_from; + struct page *page; + pgoff_t index; + + WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(to > inode->i_size); + + if (from >= to || bsize == PAGE_CACHE_SIZE) + return; + /* Page straddling @from will not have any hole block created? */ + rounded_from = round_up(from, bsize); + if (to <= rounded_from || !(rounded_from & (PAGE_CACHE_SIZE - 1))) + return; + + index = from >> PAGE_CACHE_SHIFT; + page = find_lock_page(inode->i_mapping, index); + /* Page not cached? Nothing to do */ + if (!page) + return; + /* + * See clear_page_dirty_for_io() for details why set_page_dirty() + * is needed. + */ + if (page_mkclean(page)) + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); +} +EXPORT_SYMBOL(pagecache_isize_extended); + /** * truncate_pagecache_range - unmap and remove pagecache that is hole-punched * @inode: inode -- cgit v1.2.3 From d6320cbfc92910a3e5f10c42d98c231c98db4f60 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 1 Oct 2014 21:49:46 -0400 Subject: ext4: fix mmap data corruption when blocksize < pagesize Use truncate_isize_extended() when hole is being created in a file so that ->page_mkwrite() will get called for the partial tail page if it is mmaped (see the first patch in the series for details). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d5dd7d46844e..091845298f48 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4514,8 +4514,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); goto err_out; } - } else + } else { + loff_t oldsize = inode->i_size; + i_size_write(inode, attr->ia_size); + pagecache_isize_extended(inode, oldsize, inode->i_size); + } /* * Blocks are going to be removed from the inode. Wait -- cgit v1.2.3 From bce92d566a57893e98ec83e4e5447f860d2889b7 Mon Sep 17 00:00:00 2001 From: Li Xi Date: Wed, 1 Oct 2014 22:11:06 -0400 Subject: ext4: fix return value of ext4_do_update_inode When ext4_do_update_inode() gets error from ext4_inode_blocks_set(), error number should be returned. Signed-off-by: Li Xi Signed-off-by: Theodore Ts'o Reviewed-by: Eric Sandeen Reviewed-by: Jan Kara --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 091845298f48..41c4f97c39d3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4204,7 +4204,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + err = ext4_inode_blocks_set(handle, raw_inode, ei); + if (err) { spin_unlock(&ei->i_raw_lock); goto out_brelse; } -- cgit v1.2.3 From c5d311926da483951bd5da637ed65de8614d1901 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 1 Oct 2014 22:23:15 -0400 Subject: ext4: fix over-defensive complaint after journal abort Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0074e0d23d6e..3445035c7e01 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - /* Errors can only happen if there is a bug */ - if (WARN_ON_ONCE(err)) { + /* Errors can only happen due to aborted journal or a nasty bug */ + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { -- cgit v1.2.3 From dfe076c106f63cf6bcd375c56db9c8c89a088dab Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 1 Oct 2014 22:26:17 -0400 Subject: ext4: get rid of code duplication Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 65cca2881d71..eab825f61bf9 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); } - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical); BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v1.2.3 From be5cd90ddaf471e676fad6ced29e69e8610c5d20 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 1 Oct 2014 22:57:09 -0400 Subject: ext4: optimize block allocation on grow indepth It is reasonable to prepend newly created index to older one. [ Dropped no longer used function parameter newext. -tytso ] Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 8170b3254767..c3ed9af20d20 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1263,16 +1263,24 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_extent *newext) + unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, NULL, - newext, &err, flags); + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); if (newblock == 0) return err; @@ -1373,7 +1381,7 @@ repeat: err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; -- cgit v1.2.3 From 20959c4b4078847e629eed8918abb52bfe5f559a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:02 -0700 Subject: target: Remove unneeded check in sbc_parse_cdb The check of SCF_SCSI_DATA_CDB seems to be a remnant from before hch's refactoring of this function. There are no places where that flag is set that cmd->execute_cmd isn't also set. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_sbc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/target_core_sbc.c b/drivers/target/target_core_sbc.c index bd78d9235ac6..ebe62afb957d 100644 --- a/drivers/target/target_core_sbc.c +++ b/drivers/target/target_core_sbc.c @@ -948,7 +948,7 @@ sbc_parse_cdb(struct se_cmd *cmd, struct sbc_ops *ops) } /* reject any command that we don't have a handler for */ - if (!(cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) && !cmd->execute_cmd) + if (!cmd->execute_cmd) return TCM_UNSUPPORTED_SCSI_OPCODE; if (cmd->se_cmd_flags & SCF_SCSI_DATA_CDB) { -- cgit v1.2.3 From f14bb039a4e8206439d3e9abd92bc76bd142f243 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:03 -0700 Subject: uio: Export definition of struct uio_device In order to prevent a O(n) search of the filesystem to link up its uio node with its target configuration, TCMU needs to know the minor number that UIO assigned. Expose the definition of this struct so TCMU can access this field. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/uio/uio.c | 12 ------------ include/linux/uio_driver.h | 12 +++++++++++- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a673e5b6a2e0..60fa6278fbce 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -28,18 +28,6 @@ #define UIO_MAX_DEVICES (1U << MINORBITS) -struct uio_device { - struct module *owner; - struct device *dev; - int minor; - atomic_t event; - struct fasync_struct *async_queue; - wait_queue_head_t wait; - struct uio_info *info; - struct kobject *map_dir; - struct kobject *portio_dir; -}; - static int uio_major; static struct cdev *uio_cdev; static DEFINE_IDR(uio_idr); diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 1ad4724458de..baa81718d985 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -63,7 +63,17 @@ struct uio_port { #define MAX_UIO_PORT_REGIONS 5 -struct uio_device; +struct uio_device { + struct module *owner; + struct device *dev; + int minor; + atomic_t event; + struct fasync_struct *async_queue; + wait_queue_head_t wait; + struct uio_info *info; + struct kobject *map_dir; + struct kobject *portio_dir; +}; /** * struct uio_info - UIO device capabilities -- cgit v1.2.3 From ce87685128f3e0fced2aca9f73fc8cc67704ae11 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:04 -0700 Subject: target: Add documentation on the target userspace pass-through driver Describes the driver and its interface to make it possible for user programs to back a LIO-exported LUN. Thanks to Richard W. M. Jones for review, and supplementing this doc with the first two paragraphs. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- Documentation/target/tcmu-design.txt | 378 +++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 Documentation/target/tcmu-design.txt diff --git a/Documentation/target/tcmu-design.txt b/Documentation/target/tcmu-design.txt new file mode 100644 index 000000000000..5518465290bf --- /dev/null +++ b/Documentation/target/tcmu-design.txt @@ -0,0 +1,378 @@ +Contents: + +1) TCM Userspace Design + a) Background + b) Benefits + c) Design constraints + d) Implementation overview + i. Mailbox + ii. Command ring + iii. Data Area + e) Device discovery + f) Device events + g) Other contingencies +2) Writing a user pass-through handler + a) Discovering and configuring TCMU uio devices + b) Waiting for events on the device(s) + c) Managing the command ring +3) Command filtering and pass_level +4) A final note + + +TCM Userspace Design +-------------------- + +TCM is another name for LIO, an in-kernel iSCSI target (server). +Existing TCM targets run in the kernel. TCMU (TCM in Userspace) +allows userspace programs to be written which act as iSCSI targets. +This document describes the design. + +The existing kernel provides modules for different SCSI transport +protocols. TCM also modularizes the data storage. There are existing +modules for file, block device, RAM or using another SCSI device as +storage. These are called "backstores" or "storage engines". These +built-in modules are implemented entirely as kernel code. + +Background: + +In addition to modularizing the transport protocol used for carrying +SCSI commands ("fabrics"), the Linux kernel target, LIO, also modularizes +the actual data storage as well. These are referred to as "backstores" +or "storage engines". The target comes with backstores that allow a +file, a block device, RAM, or another SCSI device to be used for the +local storage needed for the exported SCSI LUN. Like the rest of LIO, +these are implemented entirely as kernel code. + +These backstores cover the most common use cases, but not all. One new +use case that other non-kernel target solutions, such as tgt, are able +to support is using Gluster's GLFS or Ceph's RBD as a backstore. The +target then serves as a translator, allowing initiators to store data +in these non-traditional networked storage systems, while still only +using standard protocols themselves. + +If the target is a userspace process, supporting these is easy. tgt, +for example, needs only a small adapter module for each, because the +modules just use the available userspace libraries for RBD and GLFS. + +Adding support for these backstores in LIO is considerably more +difficult, because LIO is entirely kernel code. Instead of undertaking +the significant work to port the GLFS or RBD APIs and protocols to the +kernel, another approach is to create a userspace pass-through +backstore for LIO, "TCMU". + + +Benefits: + +In addition to allowing relatively easy support for RBD and GLFS, TCMU +will also allow easier development of new backstores. TCMU combines +with the LIO loopback fabric to become something similar to FUSE +(Filesystem in Userspace), but at the SCSI layer instead of the +filesystem layer. A SUSE, if you will. + +The disadvantage is there are more distinct components to configure, and +potentially to malfunction. This is unavoidable, but hopefully not +fatal if we're careful to keep things as simple as possible. + +Design constraints: + +- Good performance: high throughput, low latency +- Cleanly handle if userspace: + 1) never attaches + 2) hangs + 3) dies + 4) misbehaves +- Allow future flexibility in user & kernel implementations +- Be reasonably memory-efficient +- Simple to configure & run +- Simple to write a userspace backend + + +Implementation overview: + +The core of the TCMU interface is a memory region that is shared +between kernel and userspace. Within this region is: a control area +(mailbox); a lockless producer/consumer circular buffer for commands +to be passed up, and status returned; and an in/out data buffer area. + +TCMU uses the pre-existing UIO subsystem. UIO allows device driver +development in userspace, and this is conceptually very close to the +TCMU use case, except instead of a physical device, TCMU implements a +memory-mapped layout designed for SCSI commands. Using UIO also +benefits TCMU by handling device introspection (e.g. a way for +userspace to determine how large the shared region is) and signaling +mechanisms in both directions. + +There are no embedded pointers in the memory region. Everything is +expressed as an offset from the region's starting address. This allows +the ring to still work if the user process dies and is restarted with +the region mapped at a different virtual address. + +See target_core_user.h for the struct definitions. + +The Mailbox: + +The mailbox is always at the start of the shared memory region, and +contains a version, details about the starting offset and size of the +command ring, and head and tail pointers to be used by the kernel and +userspace (respectively) to put commands on the ring, and indicate +when the commands are completed. + +version - 1 (userspace should abort if otherwise) +flags - none yet defined. +cmdr_off - The offset of the start of the command ring from the start +of the memory region, to account for the mailbox size. +cmdr_size - The size of the command ring. This does *not* need to be a +power of two. +cmd_head - Modified by the kernel to indicate when a command has been +placed on the ring. +cmd_tail - Modified by userspace to indicate when it has completed +processing of a command. + +The Command Ring: + +Commands are placed on the ring by the kernel incrementing +mailbox.cmd_head by the size of the command, modulo cmdr_size, and +then signaling userspace via uio_event_notify(). Once the command is +completed, userspace updates mailbox.cmd_tail in the same way and +signals the kernel via a 4-byte write(). When cmd_head equals +cmd_tail, the ring is empty -- no commands are currently waiting to be +processed by userspace. + +TCMU commands start with a common header containing "len_op", a 32-bit +value that stores the length, as well as the opcode in the lowest +unused bits. Currently only two opcodes are defined, TCMU_OP_PAD and +TCMU_OP_CMD. When userspace encounters a command with PAD opcode, it +should skip ahead by the bytes in "length". (The kernel inserts PAD +entries to ensure each CMD entry fits contigously into the circular +buffer.) + +When userspace handles a CMD, it finds the SCSI CDB (Command Data +Block) via tcmu_cmd_entry.req.cdb_off. This is an offset from the +start of the overall shared memory region, not the entry. The data +in/out buffers are accessible via tht req.iov[] array. Note that +each iov.iov_base is also an offset from the start of the region. + +TCMU currently does not support BIDI operations. + +When completing a command, userspace sets rsp.scsi_status, and +rsp.sense_buffer if necessary. Userspace then increments +mailbox.cmd_tail by entry.hdr.length (mod cmdr_size) and signals the +kernel via the UIO method, a 4-byte write to the file descriptor. + +The Data Area: + +This is shared-memory space after the command ring. The organization +of this area is not defined in the TCMU interface, and userspace +should access only the parts referenced by pending iovs. + + +Device Discovery: + +Other devices may be using UIO besides TCMU. Unrelated user processes +may also be handling different sets of TCMU devices. TCMU userspace +processes must find their devices by scanning sysfs +class/uio/uio*/name. For TCMU devices, these names will be of the +format: + +tcm-user//// + +where "tcm-user" is common for all TCMU-backed UIO devices. +and allow userspace to find the device's path in the +kernel target's configfs tree. Assuming the usual mount point, it is +found at: + +/sys/kernel/config/target/core/user_/ + +This location contains attributes such as "hw_block_size", that +userspace needs to know for correct operation. + + will be a userspace-process-unique string to identify the +TCMU device as expecting to be backed by a certain handler, and +will be an additional handler-specific string for the user process to +configure the device, if needed. The name cannot contain ':', due to +LIO limitations. + +For all devices so discovered, the user handler opens /dev/uioX and +calls mmap(): + +mmap(NULL, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0) + +where size must be equal to the value read from +/sys/class/uio/uioX/maps/map0/size. + + +Device Events: + +If a new device is added or removed, a notification will be broadcast +over netlink, using a generic netlink family name of "TCM-USER" and a +multicast group named "config". This will include the UIO name as +described in the previous section, as well as the UIO minor +number. This should allow userspace to identify both the UIO device and +the LIO device, so that after determining the device is supported +(based on subtype) it can take the appropriate action. + + +Other contingencies: + +Userspace handler process never attaches: + +- TCMU will post commands, and then abort them after a timeout period + (30 seconds.) + +Userspace handler process is killed: + +- It is still possible to restart and re-connect to TCMU + devices. Command ring is preserved. However, after the timeout period, + the kernel will abort pending tasks. + +Userspace handler process hangs: + +- The kernel will abort pending tasks after a timeout period. + +Userspace handler process is malicious: + +- The process can trivially break the handling of devices it controls, + but should not be able to access kernel memory outside its shared + memory areas. + + +Writing a user pass-through handler (with example code) +------------------------------------------------------- + +A user process handing a TCMU device must support the following: + +a) Discovering and configuring TCMU uio devices +b) Waiting for events on the device(s) +c) Managing the command ring: Parsing operations and commands, + performing work as needed, setting response fields (scsi_status and + possibly sense_buffer), updating cmd_tail, and notifying the kernel + that work has been finished + +First, consider instead writing a plugin for tcmu-runner. tcmu-runner +implements all of this, and provides a higher-level API for plugin +authors. + +TCMU is designed so that multiple unrelated processes can manage TCMU +devices separately. All handlers should make sure to only open their +devices, based opon a known subtype string. + +a) Discovering and configuring TCMU UIO devices: + +(error checking omitted for brevity) + +int fd, dev_fd; +char buf[256]; +unsigned long long map_len; +void *map; + +fd = open("/sys/class/uio/uio0/name", O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +/* we only want uio devices whose name is a format we expect */ +if (strncmp(buf, "tcm-user", 8)) + exit(-1); + +/* Further checking for subtype also needed here */ + +fd = open(/sys/class/uio/%s/maps/map0/size, O_RDONLY); +ret = read(fd, buf, sizeof(buf)); +close(fd); +str_buf[ret-1] = '\0'; /* null-terminate and chop off the \n */ + +map_len = strtoull(buf, NULL, 0); + +dev_fd = open("/dev/uio0", O_RDWR); +map = mmap(NULL, map_len, PROT_READ|PROT_WRITE, MAP_SHARED, dev_fd, 0); + + +b) Waiting for events on the device(s) + +while (1) { + char buf[4]; + + int ret = read(dev_fd, buf, 4); /* will block */ + + handle_device_events(dev_fd, map); +} + + +c) Managing the command ring + +#include + +int handle_device_events(int fd, void *map) +{ + struct tcmu_mailbox *mb = map; + struct tcmu_cmd_entry *ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + int did_some_work = 0; + + /* Process events from cmd ring until we catch up with cmd_head */ + while (ent != (void *)mb + mb->cmdr_off + mb->cmd_head) { + + if (tcmu_hdr_get_op(&ent->hdr) == TCMU_OP_CMD) { + uint8_t *cdb = (void *)mb + ent->req.cdb_off; + bool success = true; + + /* Handle command here. */ + printf("SCSI opcode: 0x%x\n", cdb[0]); + + /* Set response fields */ + if (success) + ent->rsp.scsi_status = SCSI_NO_SENSE; + else { + /* Also fill in rsp->sense_buffer here */ + ent->rsp.scsi_status = SCSI_CHECK_CONDITION; + } + } + else { + /* Do nothing for PAD entries */ + } + + /* update cmd_tail */ + mb->cmd_tail = (mb->cmd_tail + tcmu_hdr_get_len(&ent->hdr)) % mb->cmdr_size; + ent = (void *) mb + mb->cmdr_off + mb->cmd_tail; + did_some_work = 1; + } + + /* Notify the kernel that work has been finished */ + if (did_some_work) { + uint32_t buf = 0; + + write(fd, &buf, 4); + } + + return 0; +} + + +Command filtering and pass_level +-------------------------------- + +TCMU supports a "pass_level" option with valid values of 0 or 1. When +the value is 0 (the default), nearly all SCSI commands received for +the device are passed through to the handler. This allows maximum +flexibility but increases the amount of code required by the handler, +to support all mandatory SCSI commands. If pass_level is set to 1, +then only IO-related commands are presented, and the rest are handled +by LIO's in-kernel command emulation. The commands presented at level +1 include all versions of: + +READ +WRITE +WRITE_VERIFY +XDWRITEREAD +WRITE_SAME +COMPARE_AND_WRITE +SYNCHRONIZE_CACHE +UNMAP + + +A final note +------------ + +Please be careful to return codes as defined by the SCSI +specifications. These are different than some values defined in the +scsi/scsi.h include file. For example, CHECK CONDITION's status code +is 2, not 1. -- cgit v1.2.3 From 3e67cfad22230ebed85c56cbe413876f33fea82b Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 3 Oct 2014 12:47:23 -0400 Subject: ext4: grab missed write_count for EXT4_IOC_SWAP_BOOT Otherwise this provokes complain like follows: WARNING: CPU: 12 PID: 5795 at fs/ext4/ext4_jbd2.c:48 ext4_journal_check_start+0x4e/0xa0() Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod CPU: 12 PID: 5795 Comm: python Not tainted 3.17.0-rc2-00175-gae5344f #158 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 0000000000000030 ffff8808116cfd28 ffffffff815c7dfc 0000000000000030 0000000000000000 ffff8808116cfd68 ffffffff8106ce8c ffff8808116cfdc8 ffff880813b16000 ffff880806ad6ae8 ffffffff81202008 0000000000000000 Call Trace: [] dump_stack+0x51/0x6d [] warn_slowpath_common+0x8c/0xc0 [] ? ext4_ioctl+0x9e8/0xeb0 [] warn_slowpath_null+0x1a/0x20 [] ext4_journal_check_start+0x4e/0xa0 [] __ext4_journal_start_sb+0x90/0x110 [] ext4_ioctl+0x9e8/0xeb0 [] ? ptrace_stop+0x24d/0x2f0 [] ? alloc_pid+0x480/0x480 [] ? ptrace_do_notify+0x92/0xb0 [] do_vfs_ioctl+0x4e5/0x550 [] ? _raw_spin_unlock_irq+0x2b/0x40 [] SyS_ioctl+0x53/0x80 [] tracesys+0xd0/0xd5 Reviewed-by: Jan Kara Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ioctl.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f2252ec274d..3d5de16f028f 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -532,9 +532,17 @@ group_add_out: } case EXT4_IOC_SWAP_BOOT: + { + int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; - return swap_inode_boot_loader(sb, inode); + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, inode); + mnt_drop_write_file(filp); + return err; + } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; -- cgit v1.2.3 From 161485e8273001e56b2f20755ad9b6217b601fb3 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 4 Aug 2014 18:16:00 +0200 Subject: efi: Implement mandatory locking for UEFI Runtime Services According to section 7.1 of the UEFI spec, Runtime Services are not fully reentrant, and there are particular combinations of calls that need to be serialized. Use a spinlock to serialize all Runtime Services with respect to all others, even if this is more than strictly needed. We've managed to get away without requiring a runtime services lock until now because most of the interactions with EFI involve EFI variables, and those operations are already serialised with __efivars->lock. Some of the assumptions underlying the decision whether locks are needed or not (e.g., SetVariable() against ResetSystem()) may not apply universally to all [new] architectures that implement UEFI. Rather than try to reason our way out of this, let's just implement at least what the spec requires in terms of locking. Signed-off-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 2 + drivers/firmware/efi/runtime-wrappers.c | 154 +++++++++++++++++++++++++++++--- 2 files changed, 146 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 044a2fd3c5fe..b39ee5f2c02d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,6 +86,8 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ +#define efi_in_nmi() in_nmi() + extern int add_efi_memmap; extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 10daa4bbb258..9694cba665c4 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -14,10 +14,82 @@ * This file is released under the GPLv2. */ +#include #include -#include /* spinlock_t */ +#include +#include #include +/* + * According to section 7.1 of the UEFI spec, Runtime Services are not fully + * reentrant, and there are particular combinations of calls that need to be + * serialized. (source: UEFI Specification v2.4A) + * + * Table 31. Rules for Reentry Into Runtime Services + * +------------------------------------+-------------------------------+ + * | If previous call is busy in | Forbidden to call | + * +------------------------------------+-------------------------------+ + * | Any | SetVirtualAddressMap() | + * +------------------------------------+-------------------------------+ + * | ConvertPointer() | ConvertPointer() | + * +------------------------------------+-------------------------------+ + * | SetVariable() | ResetSystem() | + * | UpdateCapsule() | | + * | SetTime() | | + * | SetWakeupTime() | | + * | GetNextHighMonotonicCount() | | + * +------------------------------------+-------------------------------+ + * | GetVariable() | GetVariable() | + * | GetNextVariableName() | GetNextVariableName() | + * | SetVariable() | SetVariable() | + * | QueryVariableInfo() | QueryVariableInfo() | + * | UpdateCapsule() | UpdateCapsule() | + * | QueryCapsuleCapabilities() | QueryCapsuleCapabilities() | + * | GetNextHighMonotonicCount() | GetNextHighMonotonicCount() | + * +------------------------------------+-------------------------------+ + * | GetTime() | GetTime() | + * | SetTime() | SetTime() | + * | GetWakeupTime() | GetWakeupTime() | + * | SetWakeupTime() | SetWakeupTime() | + * +------------------------------------+-------------------------------+ + * + * Due to the fact that the EFI pstore may write to the variable store in + * interrupt context, we need to use a spinlock for at least the groups that + * contain SetVariable() and QueryVariableInfo(). That leaves little else, as + * none of the remaining functions are actually ever called at runtime. + * So let's just use a single spinlock to serialize all Runtime Services calls. + */ +static DEFINE_SPINLOCK(efi_runtime_lock); + +/* + * Some runtime services calls can be reentrant under NMI, even if the table + * above says they are not. (source: UEFI Specification v2.4A) + * + * Table 32. Functions that may be called after Machine Check, INIT and NMI + * +----------------------------+------------------------------------------+ + * | Function | Called after Machine Check, INIT and NMI | + * +----------------------------+------------------------------------------+ + * | GetTime() | Yes, even if previously busy. | + * | GetVariable() | Yes, even if previously busy | + * | GetNextVariableName() | Yes, even if previously busy | + * | QueryVariableInfo() | Yes, even if previously busy | + * | SetVariable() | Yes, even if previously busy | + * | UpdateCapsule() | Yes, even if previously busy | + * | QueryCapsuleCapabilities() | Yes, even if previously busy | + * | ResetSystem() | Yes, even if previously busy | + * +----------------------------+------------------------------------------+ + * + * In order to prevent deadlocks under NMI, the wrappers for these functions + * may only grab the efi_runtime_lock or rtc_lock spinlocks if !efi_in_nmi(). + * However, not all of the services listed are reachable through NMI code paths, + * so the the special handling as suggested by the UEFI spec is only implemented + * for QueryVariableInfo() and SetVariable(), as these can be reached in NMI + * context through efi_pstore_write(). + */ +#ifndef efi_in_nmi +#define efi_in_nmi() (0) +#endif + /* * As per commit ef68c8f87ed1 ("x86: Serialize EFI time accesses on rtc_lock"), * the EFI specification requires that callers of the time related runtime @@ -32,7 +104,9 @@ static efi_status_t virt_efi_get_time(efi_time_t *tm, efi_time_cap_t *tc) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(get_time, tm, tc); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -43,7 +117,9 @@ static efi_status_t virt_efi_set_time(efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(set_time, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -56,7 +132,9 @@ static efi_status_t virt_efi_get_wakeup_time(efi_bool_t *enabled, efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(get_wakeup_time, enabled, pending, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -67,7 +145,9 @@ static efi_status_t virt_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) efi_status_t status; spin_lock_irqsave(&rtc_lock, flags); + spin_lock(&efi_runtime_lock); status = efi_call_virt(set_wakeup_time, enabled, tm); + spin_unlock(&efi_runtime_lock); spin_unlock_irqrestore(&rtc_lock, flags); return status; } @@ -78,14 +158,27 @@ static efi_status_t virt_efi_get_variable(efi_char16_t *name, unsigned long *data_size, void *data) { - return efi_call_virt(get_variable, name, vendor, attr, data_size, data); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_get_next_variable(unsigned long *name_size, efi_char16_t *name, efi_guid_t *vendor) { - return efi_call_virt(get_next_variable, name_size, name, vendor); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_next_variable, name_size, name, vendor); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_set_variable(efi_char16_t *name, @@ -94,7 +187,17 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, unsigned long data_size, void *data) { - return efi_call_virt(set_variable, name, vendor, attr, data_size, data); + unsigned long flags; + efi_status_t status; + bool __in_nmi = efi_in_nmi(); + + if (!__in_nmi) + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + if (!__in_nmi) + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_query_variable_info(u32 attr, @@ -102,16 +205,31 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *remaining_space, u64 *max_variable_size) { + unsigned long flags; + efi_status_t status; + bool __in_nmi = efi_in_nmi(); + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(query_variable_info, attr, storage_space, - remaining_space, max_variable_size); + if (!__in_nmi) + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); + if (!__in_nmi) + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { - return efi_call_virt(get_next_high_mono_count, count); + unsigned long flags; + efi_status_t status; + + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(get_next_high_mono_count, count); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static void virt_efi_reset_system(int reset_type, @@ -119,17 +237,27 @@ static void virt_efi_reset_system(int reset_type, unsigned long data_size, efi_char16_t *data) { + unsigned long flags; + + spin_lock_irqsave(&efi_runtime_lock, flags); __efi_call_virt(reset_system, reset_type, status, data_size, data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); } static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, unsigned long count, unsigned long sg_list) { + unsigned long flags; + efi_status_t status; + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(update_capsule, capsules, count, sg_list); + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(update_capsule, capsules, count, sg_list); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, @@ -137,11 +265,17 @@ static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, u64 *max_size, int *reset_type) { + unsigned long flags; + efi_status_t status; + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - return efi_call_virt(query_capsule_caps, capsules, count, max_size, - reset_type); + spin_lock_irqsave(&efi_runtime_lock, flags); + status = efi_call_virt(query_capsule_caps, capsules, count, max_size, + reset_type); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; } void efi_native_runtime_setup(void) -- cgit v1.2.3 From 5a17dae422d7de4b776a9753cd4673a343a25b4b Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 5 Aug 2014 11:52:11 +0100 Subject: efi: Add efi= parameter parsing to the EFI boot stub We need a way to customize the behaviour of the EFI boot stub, in particular, we need a way to disable the "chunking" workaround, used when reading files from the EFI System Partition. One of my machines doesn't cope well when reading files in 1MB chunks to a buffer above the 4GB mark - it appears that the "chunking" bug workaround triggers another firmware bug. This was only discovered with commit 4bf7111f5016 ("x86/efi: Support initrd loaded above 4G"), and that commit is perfectly valid. The symptom I observed was a corrupt initrd rather than any kind of crash. efi= is now used to specify EFI parameters in two very different execution environments, the EFI boot stub and during kernel boot. There is also a slight performance optimization by enabling efi=nochunk, but that's offset by the fact that you're more likely to run into firmware issues, at least on x86. This is the rationale behind leaving the workaround enabled by default. Also provide some documentation for EFI_READ_CHUNK_SIZE and why we're using the current value of 1MB. Tested-by: Ard Biesheuvel Cc: Roy Franz Cc: Maarten Lankhorst Cc: Leif Lindholm Cc: Borislav Petkov Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 5 ++- arch/x86/boot/compressed/eboot.c | 4 ++ arch/x86/platform/efi/efi.c | 19 +++++++- drivers/firmware/efi/libstub/arm-stub.c | 4 ++ drivers/firmware/efi/libstub/efi-stub-helper.c | 62 +++++++++++++++++++++++++- include/linux/efi.h | 2 + 6 files changed, 91 insertions(+), 5 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 5ae8608ca9f5..67d79597d9d6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -992,10 +992,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map" } + Format: { "old_map", "nochunk" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. + nochunk: disable reading files in "chunks" in the EFI + boot stub, as chunking can cause problems with some + firmware implementations. efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f277184e2ac1..f4bdab1dbf66 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -1100,6 +1100,10 @@ struct boot_params *make_boot_params(struct efi_config *c) else initrd_addr_max = hdr->initrd_addr_max; + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + goto fail2; + status = handle_cmdline_files(sys_table, image, (char *)(unsigned long)hdr->cmd_line_ptr, "initrd=", initrd_addr_max, diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 850da94fef30..a1f745b0bf1d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -943,8 +943,23 @@ static int __init parse_efi_cmdline(char *str) if (*str == '=') str++; - if (!strncmp(str, "old_map", 7)) - set_bit(EFI_OLD_MEMMAP, &efi.flags); + while (*str) { + if (!strncmp(str, "old_map", 7)) { + set_bit(EFI_OLD_MEMMAP, &efi.flags); + str += strlen("old_map"); + } + + /* + * Skip any options we don't understand. Presumably + * they apply to the EFI boot stub. + */ + while (*str && *str != ',') + str++; + + /* If we hit a delimiter, skip it */ + if (*str == ',') + str++; + } return 0; } diff --git a/drivers/firmware/efi/libstub/arm-stub.c b/drivers/firmware/efi/libstub/arm-stub.c index 480339b6b110..75ee05964cbc 100644 --- a/drivers/firmware/efi/libstub/arm-stub.c +++ b/drivers/firmware/efi/libstub/arm-stub.c @@ -226,6 +226,10 @@ unsigned long __init efi_entry(void *handle, efi_system_table_t *sys_table, goto fail_free_image; } + status = efi_parse_options(cmdline_ptr); + if (status != EFI_SUCCESS) + pr_efi_err(sys_table, "Failed to parse EFI cmdline options\n"); + /* * Unauthenticated device tree data is a security hazard, so * ignore 'dtb=' unless UEFI Secure Boot is disabled. diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 32d5cca30f49..a920fec8fe88 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -15,8 +15,23 @@ #include "efistub.h" +/* + * Some firmware implementations have problems reading files in one go. + * A read chunk size of 1MB seems to work for most platforms. + * + * Unfortunately, reading files in chunks triggers *other* bugs on some + * platforms, so we provide a way to disable this workaround, which can + * be done by passing "efi=nochunk" on the EFI boot stub command line. + * + * If you experience issues with initrd images being corrupt it's worth + * trying efi=nochunk, but chunking is enabled by default because there + * are far more machines that require the workaround than those that + * break with it enabled. + */ #define EFI_READ_CHUNK_SIZE (1024 * 1024) +static unsigned long __chunk_size = EFI_READ_CHUNK_SIZE; + struct file_info { efi_file_handle_t *handle; u64 size; @@ -281,6 +296,49 @@ void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, efi_call_early(free_pages, addr, nr_pages); } +/* + * Parse the ASCII string 'cmdline' for EFI options, denoted by the efi= + * option, e.g. efi=nochunk. + * + * It should be noted that efi= is parsed in two very different + * environments, first in the early boot environment of the EFI boot + * stub, and subsequently during the kernel boot. + */ +efi_status_t efi_parse_options(char *cmdline) +{ + char *str; + + /* + * If no EFI parameters were specified on the cmdline we've got + * nothing to do. + */ + str = strstr(cmdline, "efi="); + if (!str) + return EFI_SUCCESS; + + /* Skip ahead to first argument */ + str += strlen("efi="); + + /* + * Remember, because efi= is also used by the kernel we need to + * skip over arguments we don't understand. + */ + while (*str) { + if (!strncmp(str, "nochunk", 7)) { + str += strlen("nochunk"); + __chunk_size = -1UL; + } + + /* Group words together, delimited by "," */ + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return EFI_SUCCESS; +} /* * Check the cmdline for a LILO-style file= arguments. @@ -423,8 +481,8 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, size = files[j].size; while (size) { unsigned long chunksize; - if (size > EFI_READ_CHUNK_SIZE) - chunksize = EFI_READ_CHUNK_SIZE; + if (size > __chunk_size) + chunksize = __chunk_size; else chunksize = size; diff --git a/include/linux/efi.h b/include/linux/efi.h index 45cb4ffdea62..518779fb5e90 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1227,4 +1227,6 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); +efi_status_t efi_parse_options(char *cmdline); + #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From 1282278ee00b41f314d1bce058a6b28b1bd49c21 Mon Sep 17 00:00:00 2001 From: Josh Triplett Date: Wed, 30 Jul 2014 12:23:32 -0700 Subject: efi-bgrt: Add error handling; inform the user when ignoring the BGRT Gracefully handle failures to allocate memory for the image, which might be arbitrarily large. efi_bgrt_init can fail in various ways as well, usually because the BIOS-provided BGRT structure does not match expectations. Add appropriate error messages rather than failing silently. Reported-by: Srihari Vijayaraghavan Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=81321 Signed-off-by: Josh Triplett Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi-bgrt.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index f15103dff4b4..d143d216d52b 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -40,20 +40,40 @@ void __init efi_bgrt_init(void) if (ACPI_FAILURE(status)) return; - if (bgrt_tab->header.length < sizeof(*bgrt_tab)) + if (bgrt_tab->header.length < sizeof(*bgrt_tab)) { + pr_err("Ignoring BGRT: invalid length %u (expected %zu)\n", + bgrt_tab->header.length, sizeof(*bgrt_tab)); return; - if (bgrt_tab->version != 1 || bgrt_tab->status != 1) + } + if (bgrt_tab->version != 1) { + pr_err("Ignoring BGRT: invalid version %u (expected 1)\n", + bgrt_tab->version); + return; + } + if (bgrt_tab->status != 1) { + pr_err("Ignoring BGRT: invalid status %u (expected 1)\n", + bgrt_tab->status); + return; + } + if (bgrt_tab->image_type != 0) { + pr_err("Ignoring BGRT: invalid image type %u (expected 0)\n", + bgrt_tab->image_type); return; - if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) + } + if (!bgrt_tab->image_address) { + pr_err("Ignoring BGRT: null image address\n"); return; + } image = efi_lookup_mapped_addr(bgrt_tab->image_address); if (!image) { image = early_memremap(bgrt_tab->image_address, sizeof(bmp_header)); ioremapped = true; - if (!image) + if (!image) { + pr_err("Ignoring BGRT: failed to map image header memory\n"); return; + } } memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); @@ -61,14 +81,18 @@ void __init efi_bgrt_init(void) early_iounmap(image, sizeof(bmp_header)); bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); - if (!bgrt_image) + bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + if (!bgrt_image) { + pr_err("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", + bgrt_image_size); return; + } if (ioremapped) { image = early_memremap(bgrt_tab->image_address, bmp_header.size); if (!image) { + pr_err("Ignoring BGRT: failed to map image memory\n"); kfree(bgrt_image); bgrt_image = NULL; return; -- cgit v1.2.3 From b2e0a54a1296a91b800f316df7bef7d1905e4fd0 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:26 +0800 Subject: efi: Move noefi early param code out of x86 arch code noefi param can be used for arches other than X86 later, thus move it out of x86 platform code. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 2 +- arch/x86/platform/efi/efi.c | 10 +--------- drivers/firmware/efi/efi.c | 13 +++++++++++++ include/linux/efi.h | 1 + 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 67d79597d9d6..08df275eee2b 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2169,7 +2169,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nodsp [SH] Disable hardware DSP at boot time. - noefi [X86] Disable EFI runtime services support. + noefi Disable EFI runtime services support. noexec [IA-64] diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index a1f745b0bf1d..c90d3cd2728c 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,14 +70,6 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -static bool disable_runtime __initdata = false; -static int __init setup_noefi(char *arg) -{ - disable_runtime = true; - return 0; -} -early_param("noefi", setup_noefi); - int add_efi_memmap; EXPORT_SYMBOL(add_efi_memmap); @@ -492,7 +484,7 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (disable_runtime || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) return; } if (efi_memmap_init()) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 64ecbb501c50..c8f01a73edb5 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -41,6 +41,19 @@ struct efi __read_mostly efi = { }; EXPORT_SYMBOL(efi); +static bool disable_runtime; +static int __init setup_noefi(char *arg) +{ + disable_runtime = true; + return 0; +} +early_param("noefi", setup_noefi); + +bool efi_runtime_disabled(void) +{ + return disable_runtime; +} + static struct kobject *efi_kobj; static struct kobject *efivars_kobj; diff --git a/include/linux/efi.h b/include/linux/efi.h index 518779fb5e90..4812ed0b0374 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1229,4 +1229,5 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, efi_status_t efi_parse_options(char *cmdline); +bool efi_runtime_disabled(void); #endif /* _LINUX_EFI_H */ -- cgit v1.2.3 From 6ccc72b87b83ece31c2a75bbe07f440b0378f7a9 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:27 +0800 Subject: lib: Add a generic cmdline parse function parse_option_str There should be a generic function to parse params like a=b,c Adding parse_option_str in lib/cmdline.c which will return true if there's specified option set in the params. Also updated efi=old_map parsing code to use the new function Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 22 ++-------------------- include/linux/kernel.h | 1 + lib/cmdline.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 20 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c90d3cd2728c..c73a7df5d37f 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -932,26 +932,8 @@ u64 efi_mem_attributes(unsigned long phys_addr) static int __init parse_efi_cmdline(char *str) { - if (*str == '=') - str++; - - while (*str) { - if (!strncmp(str, "old_map", 7)) { - set_bit(EFI_OLD_MEMMAP, &efi.flags); - str += strlen("old_map"); - } - - /* - * Skip any options we don't understand. Presumably - * they apply to the EFI boot stub. - */ - while (*str && *str != ',') - str++; - - /* If we hit a delimiter, skip it */ - if (*str == ',') - str++; - } + if (parse_option_str(str, "old_map")) + set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 95624bed87ef..f66427ef0628 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -407,6 +407,7 @@ int vsscanf(const char *, const char *, va_list); extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); +extern bool parse_option_str(const char *str, const char *option); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 76a712e6e20e..8f13cf73c2ec 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -160,3 +160,32 @@ unsigned long long memparse(const char *ptr, char **retptr) return ret; } EXPORT_SYMBOL(memparse); + +/** + * parse_option_str - Parse a string and check an option is set or not + * @str: String to be parsed + * @option: option name + * + * This function parses a string containing a comma-separated list of + * strings like a=b,c. + * + * Return true if there's such option in the string, or return false. + */ +bool parse_option_str(const char *str, const char *option) +{ + while (*str) { + if (!strncmp(str, option, strlen(option))) { + str += strlen(option); + if (!*str || *str == ',') + return true; + } + + while (*str && *str != ',') + str++; + + if (*str == ',') + str++; + } + + return false; +} -- cgit v1.2.3 From 5ae3683c380e78aebc60d710617ba2c0dccc9e84 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:28 +0800 Subject: efi: Add kernel param efi=noruntime noefi kernel param means actually disabling efi runtime, Per suggestion from Leif Lindholm efi=noruntime should be better. But since noefi is already used in X86 thus just adding another param efi=noruntime for same purpose. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- Documentation/kernel-parameters.txt | 3 ++- arch/x86/platform/efi/efi.c | 4 ++-- drivers/firmware/efi/efi.c | 9 +++++++++ 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 08df275eee2b..d0e431611a04 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -992,13 +992,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Format: {"off" | "on" | "skip[mbr]"} efi= [EFI] - Format: { "old_map", "nochunk" } + Format: { "old_map", "nochunk", "noruntime" } old_map [X86-64]: switch to the old ioremap-based EFI runtime services mapping. 32-bit still uses this one by default. nochunk: disable reading files in "chunks" in the EFI boot stub, as chunking can cause problems with some firmware implementations. + noruntime : disable EFI runtime services support efi_no_storage_paranoia [EFI; X86] Using this parameter you can use more than 50% of diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index c73a7df5d37f..00f4cc566adb 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -930,11 +930,11 @@ u64 efi_mem_attributes(unsigned long phys_addr) return 0; } -static int __init parse_efi_cmdline(char *str) +static int __init arch_parse_efi_cmdline(char *str) { if (parse_option_str(str, "old_map")) set_bit(EFI_OLD_MEMMAP, &efi.flags); return 0; } -early_param("efi", parse_efi_cmdline); +early_param("efi", arch_parse_efi_cmdline); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index c8f01a73edb5..cebfa36a27ae 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -54,6 +54,15 @@ bool efi_runtime_disabled(void) return disable_runtime; } +static int __init parse_efi_cmdline(char *str) +{ + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + + return 0; +} +early_param("efi", parse_efi_cmdline); + static struct kobject *efi_kobj; static struct kobject *efivars_kobj; -- cgit v1.2.3 From 88f8abd594082b9c08789e8527e4e38116a963ec Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:29 +0800 Subject: arm64/efi: uefi_init error handling fix There's one early memmap leak in uefi_init error path, fix it and slightly tune the error handling code. Signed-off-by: Dave Young Acked-by: Mark Salter Reported-by: Will Deacon Acked-by: Will Deacon Signed-off-by: Matt Fleming --- arch/arm64/kernel/efi.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index e72f3100958f..6ed0362dd579 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -89,7 +89,8 @@ static int __init uefi_init(void) */ if (efi.systab->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { pr_err("System table signature incorrect\n"); - return -EINVAL; + retval = -EINVAL; + goto out; } if ((efi.systab->hdr.revision >> 16) < 2) pr_warn("Warning: EFI system table version %d.%02d, expected 2.00 or greater\n", @@ -103,6 +104,7 @@ static int __init uefi_init(void) for (i = 0; i < (int) sizeof(vendor) - 1 && *c16; ++i) vendor[i] = c16[i]; vendor[i] = '\0'; + early_memunmap(c16, sizeof(vendor)); } pr_info("EFI v%u.%.02u by %s\n", @@ -113,9 +115,8 @@ static int __init uefi_init(void) if (retval == 0) set_bit(EFI_CONFIG_TABLES, &efi.flags); - early_memunmap(c16, sizeof(vendor)); +out: early_memunmap(efi.systab, sizeof(efi_system_table_t)); - return retval; } -- cgit v1.2.3 From 6632210f50530ea53dd21f786f2854609d928689 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 18 Aug 2014 09:30:07 +0800 Subject: arm64/efi: Do not enter virtual mode if booting with efi=noruntime or noefi In case efi runtime disabled via noefi kernel cmdline arm64_enter_virtual_mode should error out. At the same time move early_memunmap(memmap.map, mapsize) to the beginning of the function or it will leak early mem. Signed-off-by: Dave Young Reviewed-by: Will Deacon Signed-off-by: Matt Fleming --- arch/arm64/kernel/efi.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 6ed0362dd579..8f5db4a3c9d9 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -392,11 +392,16 @@ static int __init arm64_enter_virtual_mode(void) return -1; } - pr_info("Remapping and enabling EFI services.\n"); - - /* replace early memmap mapping with permanent mapping */ mapsize = memmap.map_end - memmap.map; early_memunmap(memmap.map, mapsize); + + if (efi_runtime_disabled()) { + pr_info("EFI runtime services will be disabled.\n"); + return -1; + } + + pr_info("Remapping and enabling EFI services.\n"); + /* replace early memmap mapping with permanent mapping */ memmap.map = (__force void *)ioremap_cache((phys_addr_t)memmap.phys_map, mapsize); memmap.map_end = memmap.map + mapsize; -- cgit v1.2.3 From a5a750a98fe2812263546cc20badd32bab21ec52 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Thu, 14 Aug 2014 17:15:31 +0800 Subject: x86/efi: Clear EFI_RUNTIME_SERVICES if failing to enter virtual mode If enter virtual mode failed due to some reason other than the efi call the EFI_RUNTIME_SERVICES bit in efi.flags should be cleared thus users of efi runtime services can check the bit and handle the case instead of assume efi runtime is ok. Per Matt, if efi call SetVirtualAddressMap fails we will be not sure it's safe to make any assumptions about the state of the system. So kernel panics instead of clears EFI_RUNTIME_SERVICES bit. Signed-off-by: Dave Young Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 00f4cc566adb..d3096c0aa941 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -724,6 +724,7 @@ static void __init kexec_enter_virtual_mode(void) */ if (!efi_is_native()) { efi_unmap_memmap(); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -797,6 +798,7 @@ static void __init __efi_enter_virtual_mode(void) new_memmap = efi_map_regions(&count, &pg_shift); if (!new_memmap) { pr_err("Error reallocating memory, EFI runtime non-functional!\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -804,8 +806,10 @@ static void __init __efi_enter_virtual_mode(void) BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) + if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; + } efi_sync_low_kernel_mappings(); efi_dump_pagetable(); -- cgit v1.2.3 From 9c97e0bdd4b4ae44577a1b1ec949e782084e9a78 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:19 +0200 Subject: efi: Add macro for EFI_MEMORY_UCE memory attribute Add the following macro from the UEFI spec, for completeness: EFI_MEMORY_UCE Memory cacheability attribute: The memory region supports being configured as not cacheable, exported, and supports the "fetch and add" semaphore mechanism. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- include/linux/efi.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/efi.h b/include/linux/efi.h index 4812ed0b0374..7464032ae00a 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -92,6 +92,7 @@ typedef struct { #define EFI_MEMORY_WC ((u64)0x0000000000000002ULL) /* write-coalescing */ #define EFI_MEMORY_WT ((u64)0x0000000000000004ULL) /* write-through */ #define EFI_MEMORY_WB ((u64)0x0000000000000008ULL) /* write-back */ +#define EFI_MEMORY_UCE ((u64)0x0000000000000010ULL) /* uncached, exported */ #define EFI_MEMORY_WP ((u64)0x0000000000001000ULL) /* write-protect */ #define EFI_MEMORY_RP ((u64)0x0000000000002000ULL) /* read-protect */ #define EFI_MEMORY_XP ((u64)0x0000000000004000ULL) /* execute-protect */ -- cgit v1.2.3 From 98d2a6ca14520904a47c46258d3bad02ffcd3f96 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:20 +0200 Subject: efi: Introduce efi_md_typeattr_format() At the moment, there are three architectures debug-printing the EFI memory map at initialization: x86, ia64, and arm64. They all use different format strings, plus the EFI memory type and the EFI memory attributes are similarly hard to decode for a human reader. Introduce a helper __init function that formats the memory type and the memory attributes in a unified way, to a user-provided character buffer. The array "memory_type_name" is copied from the arm64 code, temporarily duplicating it. The (otherwise optional) braces around each string literal in the initializer list are dropped in order to match the kernel coding style more closely. The element size is tightened from 32 to 20 bytes (maximum actual string length + 1) so that we can derive the field width from the element size. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel [ Dropped useless 'register' keyword, which compiler will ignore ] Signed-off-by: Matt Fleming --- drivers/firmware/efi/efi.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/efi.h | 7 ++++++ 2 files changed, 64 insertions(+) diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index cebfa36a27ae..8590099ac148 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -445,3 +445,60 @@ int __init efi_get_fdt_params(struct efi_fdt_params *params, int verbose) return ret; } #endif /* CONFIG_EFI_PARAMS_FROM_FDT */ + +static __initdata char memory_type_name[][20] = { + "Reserved", + "Loader Code", + "Loader Data", + "Boot Code", + "Boot Data", + "Runtime Code", + "Runtime Data", + "Conventional Memory", + "Unusable Memory", + "ACPI Reclaim Memory", + "ACPI Memory NVS", + "Memory Mapped I/O", + "MMIO Port Space", + "PAL Code" +}; + +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md) +{ + char *pos; + int type_len; + u64 attr; + + pos = buf; + if (md->type >= ARRAY_SIZE(memory_type_name)) + type_len = snprintf(pos, size, "[type=%u", md->type); + else + type_len = snprintf(pos, size, "[%-*s", + (int)(sizeof(memory_type_name[0]) - 1), + memory_type_name[md->type]); + if (type_len >= size) + return buf; + + pos += type_len; + size -= type_len; + + attr = md->attribute; + if (attr & ~(EFI_MEMORY_UC | EFI_MEMORY_WC | EFI_MEMORY_WT | + EFI_MEMORY_WB | EFI_MEMORY_UCE | EFI_MEMORY_WP | + EFI_MEMORY_RP | EFI_MEMORY_XP | EFI_MEMORY_RUNTIME)) + snprintf(pos, size, "|attr=0x%016llx]", + (unsigned long long)attr); + else + snprintf(pos, size, "|%3s|%2s|%2s|%2s|%3s|%2s|%2s|%2s|%2s]", + attr & EFI_MEMORY_RUNTIME ? "RUN" : "", + attr & EFI_MEMORY_XP ? "XP" : "", + attr & EFI_MEMORY_RP ? "RP" : "", + attr & EFI_MEMORY_WP ? "WP" : "", + attr & EFI_MEMORY_UCE ? "UCE" : "", + attr & EFI_MEMORY_WB ? "WB" : "", + attr & EFI_MEMORY_WT ? "WT" : "", + attr & EFI_MEMORY_WC ? "WC" : "", + attr & EFI_MEMORY_UC ? "UC" : ""); + return buf; +} diff --git a/include/linux/efi.h b/include/linux/efi.h index 7464032ae00a..78b29b133e14 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -887,6 +887,13 @@ extern bool efi_poweroff_required(void); (md) <= (efi_memory_desc_t *)((m)->map_end - (m)->desc_size); \ (md) = (void *)(md) + (m)->desc_size) +/* + * Format an EFI memory descriptor's type and attributes to a user-provided + * character buffer, as per snprintf(), and return the buffer. + */ +char * __init efi_md_typeattr_format(char *buf, size_t size, + const efi_memory_desc_t *md); + /** * efi_range_is_wc - check the WC bit on an address range * @start: starting kvirt address -- cgit v1.2.3 From ace1d1218da536f3138e6e702788bd5d62ba7eca Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:21 +0200 Subject: x86: efi: Format EFI memory type & attrs with efi_md_typeattr_format() An example log excerpt demonstrating the change: Before the patch: > efi: mem00: type=7, attr=0xf, range=[0x0000000000000000-0x000000000009f000) (0MB) > efi: mem01: type=2, attr=0xf, range=[0x000000000009f000-0x00000000000a0000) (0MB) > efi: mem02: type=7, attr=0xf, range=[0x0000000000100000-0x0000000000400000) (3MB) > efi: mem03: type=2, attr=0xf, range=[0x0000000000400000-0x0000000000800000) (4MB) > efi: mem04: type=10, attr=0xf, range=[0x0000000000800000-0x0000000000808000) (0MB) > efi: mem05: type=7, attr=0xf, range=[0x0000000000808000-0x0000000000810000) (0MB) > efi: mem06: type=10, attr=0xf, range=[0x0000000000810000-0x0000000000900000) (0MB) > efi: mem07: type=4, attr=0xf, range=[0x0000000000900000-0x0000000001100000) (8MB) > efi: mem08: type=7, attr=0xf, range=[0x0000000001100000-0x0000000001400000) (3MB) > efi: mem09: type=2, attr=0xf, range=[0x0000000001400000-0x0000000002613000) (18MB) > efi: mem10: type=7, attr=0xf, range=[0x0000000002613000-0x0000000004000000) (25MB) > efi: mem11: type=4, attr=0xf, range=[0x0000000004000000-0x0000000004020000) (0MB) > efi: mem12: type=7, attr=0xf, range=[0x0000000004020000-0x00000000068ea000) (40MB) > efi: mem13: type=2, attr=0xf, range=[0x00000000068ea000-0x00000000068f0000) (0MB) > efi: mem14: type=3, attr=0xf, range=[0x00000000068f0000-0x0000000006c7b000) (3MB) > efi: mem15: type=6, attr=0x800000000000000f, range=[0x0000000006c7b000-0x0000000006c7d000) (0MB) > efi: mem16: type=5, attr=0x800000000000000f, range=[0x0000000006c7d000-0x0000000006c85000) (0MB) > efi: mem17: type=6, attr=0x800000000000000f, range=[0x0000000006c85000-0x0000000006c87000) (0MB) > efi: mem18: type=3, attr=0xf, range=[0x0000000006c87000-0x0000000006ca3000) (0MB) > efi: mem19: type=6, attr=0x800000000000000f, range=[0x0000000006ca3000-0x0000000006ca6000) (0MB) > efi: mem20: type=10, attr=0xf, range=[0x0000000006ca6000-0x0000000006cc6000) (0MB) > efi: mem21: type=6, attr=0x800000000000000f, range=[0x0000000006cc6000-0x0000000006d95000) (0MB) > efi: mem22: type=5, attr=0x800000000000000f, range=[0x0000000006d95000-0x0000000006e22000) (0MB) > efi: mem23: type=7, attr=0xf, range=[0x0000000006e22000-0x0000000007165000) (3MB) > efi: mem24: type=4, attr=0xf, range=[0x0000000007165000-0x0000000007d22000) (11MB) > efi: mem25: type=7, attr=0xf, range=[0x0000000007d22000-0x0000000007d25000) (0MB) > efi: mem26: type=3, attr=0xf, range=[0x0000000007d25000-0x0000000007ea2000) (1MB) > efi: mem27: type=5, attr=0x800000000000000f, range=[0x0000000007ea2000-0x0000000007ed2000) (0MB) > efi: mem28: type=6, attr=0x800000000000000f, range=[0x0000000007ed2000-0x0000000007ef6000) (0MB) > efi: mem29: type=7, attr=0xf, range=[0x0000000007ef6000-0x0000000007f00000) (0MB) > efi: mem30: type=9, attr=0xf, range=[0x0000000007f00000-0x0000000007f02000) (0MB) > efi: mem31: type=10, attr=0xf, range=[0x0000000007f02000-0x0000000007f06000) (0MB) > efi: mem32: type=4, attr=0xf, range=[0x0000000007f06000-0x0000000007fd0000) (0MB) > efi: mem33: type=6, attr=0x800000000000000f, range=[0x0000000007fd0000-0x0000000007ff0000) (0MB) > efi: mem34: type=7, attr=0xf, range=[0x0000000007ff0000-0x0000000008000000) (0MB) After the patch: > efi: mem00: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000000000000-0x000000000009f000) (0MB) > efi: mem01: [Loader Data | | | | | |WB|WT|WC|UC] range=[0x000000000009f000-0x00000000000a0000) (0MB) > efi: mem02: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000000100000-0x0000000000400000) (3MB) > efi: mem03: [Loader Data | | | | | |WB|WT|WC|UC] range=[0x0000000000400000-0x0000000000800000) (4MB) > efi: mem04: [ACPI Memory NVS | | | | | |WB|WT|WC|UC] range=[0x0000000000800000-0x0000000000808000) (0MB) > efi: mem05: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000000808000-0x0000000000810000) (0MB) > efi: mem06: [ACPI Memory NVS | | | | | |WB|WT|WC|UC] range=[0x0000000000810000-0x0000000000900000) (0MB) > efi: mem07: [Boot Data | | | | | |WB|WT|WC|UC] range=[0x0000000000900000-0x0000000001100000) (8MB) > efi: mem08: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000001100000-0x0000000001400000) (3MB) > efi: mem09: [Loader Data | | | | | |WB|WT|WC|UC] range=[0x0000000001400000-0x0000000002613000) (18MB) > efi: mem10: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000002613000-0x0000000004000000) (25MB) > efi: mem11: [Boot Data | | | | | |WB|WT|WC|UC] range=[0x0000000004000000-0x0000000004020000) (0MB) > efi: mem12: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000004020000-0x00000000068ea000) (40MB) > efi: mem13: [Loader Data | | | | | |WB|WT|WC|UC] range=[0x00000000068ea000-0x00000000068f0000) (0MB) > efi: mem14: [Boot Code | | | | | |WB|WT|WC|UC] range=[0x00000000068f0000-0x0000000006c7b000) (3MB) > efi: mem15: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006c7b000-0x0000000006c7d000) (0MB) > efi: mem16: [Runtime Code |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006c7d000-0x0000000006c85000) (0MB) > efi: mem17: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006c85000-0x0000000006c87000) (0MB) > efi: mem18: [Boot Code | | | | | |WB|WT|WC|UC] range=[0x0000000006c87000-0x0000000006ca3000) (0MB) > efi: mem19: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006ca3000-0x0000000006ca6000) (0MB) > efi: mem20: [ACPI Memory NVS | | | | | |WB|WT|WC|UC] range=[0x0000000006ca6000-0x0000000006cc6000) (0MB) > efi: mem21: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006cc6000-0x0000000006d95000) (0MB) > efi: mem22: [Runtime Code |RUN| | | | |WB|WT|WC|UC] range=[0x0000000006d95000-0x0000000006e22000) (0MB) > efi: mem23: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000006e22000-0x0000000007165000) (3MB) > efi: mem24: [Boot Data | | | | | |WB|WT|WC|UC] range=[0x0000000007165000-0x0000000007d22000) (11MB) > efi: mem25: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000007d22000-0x0000000007d25000) (0MB) > efi: mem26: [Boot Code | | | | | |WB|WT|WC|UC] range=[0x0000000007d25000-0x0000000007ea2000) (1MB) > efi: mem27: [Runtime Code |RUN| | | | |WB|WT|WC|UC] range=[0x0000000007ea2000-0x0000000007ed2000) (0MB) > efi: mem28: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000007ed2000-0x0000000007ef6000) (0MB) > efi: mem29: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000007ef6000-0x0000000007f00000) (0MB) > efi: mem30: [ACPI Reclaim Memory| | | | | |WB|WT|WC|UC] range=[0x0000000007f00000-0x0000000007f02000) (0MB) > efi: mem31: [ACPI Memory NVS | | | | | |WB|WT|WC|UC] range=[0x0000000007f02000-0x0000000007f06000) (0MB) > efi: mem32: [Boot Data | | | | | |WB|WT|WC|UC] range=[0x0000000007f06000-0x0000000007fd0000) (0MB) > efi: mem33: [Runtime Data |RUN| | | | |WB|WT|WC|UC] range=[0x0000000007fd0000-0x0000000007ff0000) (0MB) > efi: mem34: [Conventional Memory| | | | | |WB|WT|WC|UC] range=[0x0000000007ff0000-0x0000000008000000) (0MB) Both the type enum and the attribute bitmap are decoded, with the additional benefit that the memory ranges line up as well. Signed-off-by: Laszlo Ersek Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index d3096c0aa941..f0be3d1da3de 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -202,9 +202,12 @@ static void __init print_efi_memmap(void) for (p = memmap.map, i = 0; p < memmap.map_end; p += memmap.desc_size, i++) { + char buf[64]; + md = p; - pr_info("mem%02u: type=%u, attr=0x%llx, range=[0x%016llx-0x%016llx) (%lluMB)\n", - i, md->type, md->attribute, md->phys_addr, + pr_info("mem%02u: %s range=[0x%016llx-0x%016llx) (%lluMB)\n", + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT), (md->num_pages >> (20 - EFI_PAGE_SHIFT))); } -- cgit v1.2.3 From 77b12bcfc532b6bb9adc0f898c6b08216056eb89 Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:22 +0200 Subject: ia64: efi: Format EFI memory type & attrs with efi_md_typeattr_format() The effects of the patch on the i64 memory map log are similar to those visible in the previous (x86) patch: the type enum and the attribute bitmap are decoded. Signed-off-by: Laszlo Ersek Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/ia64/kernel/efi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c index 741b99c1a0b1..c52d7540dc05 100644 --- a/arch/ia64/kernel/efi.c +++ b/arch/ia64/kernel/efi.c @@ -568,6 +568,7 @@ efi_init (void) { const char *unit; unsigned long size; + char buf[64]; md = p; size = md->num_pages << EFI_PAGE_SHIFT; @@ -586,9 +587,10 @@ efi_init (void) unit = "KB"; } - printk("mem%02d: type=%2u, attr=0x%016lx, " + printk("mem%02d: %s " "range=[0x%016lx-0x%016lx) (%4lu%s)\n", - i, md->type, md->attribute, md->phys_addr, + i, efi_md_typeattr_format(buf, sizeof(buf), md), + md->phys_addr, md->phys_addr + efi_md_size(md), size, unit); } } -- cgit v1.2.3 From 65ba758f3e7d99c49b5710f6010bc6ba1e50d16c Mon Sep 17 00:00:00 2001 From: Laszlo Ersek Date: Wed, 3 Sep 2014 13:32:23 +0200 Subject: arm64: efi: Format EFI memory type & attrs with efi_md_typeattr_format() An example log excerpt demonstrating the change: Before the patch: > Processing EFI memory map: > 0x000040000000-0x000040000fff [Loader Data] > 0x000040001000-0x00004007ffff [Conventional Memory] > 0x000040080000-0x00004072afff [Loader Data] > 0x00004072b000-0x00005fdfffff [Conventional Memory] > 0x00005fe00000-0x00005fe0ffff [Loader Data] > 0x00005fe10000-0x0000964e8fff [Conventional Memory] > 0x0000964e9000-0x0000964e9fff [Loader Data] > 0x0000964ea000-0x000096c52fff [Loader Code] > 0x000096c53000-0x00009709dfff [Boot Code]* > 0x00009709e000-0x0000970b3fff [Runtime Code]* > 0x0000970b4000-0x0000970f4fff [Runtime Data]* > 0x0000970f5000-0x000097117fff [Runtime Code]* > 0x000097118000-0x000097199fff [Runtime Data]* > 0x00009719a000-0x0000971dffff [Runtime Code]* > 0x0000971e0000-0x0000997f8fff [Conventional Memory] > 0x0000997f9000-0x0000998f1fff [Boot Data]* > 0x0000998f2000-0x0000999eafff [Conventional Memory] > 0x0000999eb000-0x00009af09fff [Boot Data]* > 0x00009af0a000-0x00009af21fff [Conventional Memory] > 0x00009af22000-0x00009af46fff [Boot Data]* > 0x00009af47000-0x00009af5bfff [Conventional Memory] > 0x00009af5c000-0x00009afe1fff [Boot Data]* > 0x00009afe2000-0x00009afe2fff [Conventional Memory] > 0x00009afe3000-0x00009c01ffff [Boot Data]* > 0x00009c020000-0x00009efbffff [Conventional Memory] > 0x00009efc0000-0x00009f14efff [Boot Code]* > 0x00009f14f000-0x00009f162fff [Runtime Code]* > 0x00009f163000-0x00009f194fff [Runtime Data]* > 0x00009f195000-0x00009f197fff [Boot Data]* > 0x00009f198000-0x00009f198fff [Runtime Data]* > 0x00009f199000-0x00009f1acfff [Conventional Memory] > 0x00009f1ad000-0x00009f1affff [Boot Data]* > 0x00009f1b0000-0x00009f1b0fff [Runtime Data]* > 0x00009f1b1000-0x00009fffffff [Boot Data]* > 0x000004000000-0x000007ffffff [Memory Mapped I/O] > 0x000009010000-0x000009010fff [Memory Mapped I/O] After the patch: > Processing EFI memory map: > 0x000040000000-0x000040000fff [Loader Data | | | | | |WB|WT|WC|UC] > 0x000040001000-0x00004007ffff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x000040080000-0x00004072afff [Loader Data | | | | | |WB|WT|WC|UC] > 0x00004072b000-0x00005fdfffff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00005fe00000-0x00005fe0ffff [Loader Data | | | | | |WB|WT|WC|UC] > 0x00005fe10000-0x0000964e8fff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x0000964e9000-0x0000964e9fff [Loader Data | | | | | |WB|WT|WC|UC] > 0x0000964ea000-0x000096c52fff [Loader Code | | | | | |WB|WT|WC|UC] > 0x000096c53000-0x00009709dfff [Boot Code | | | | | |WB|WT|WC|UC]* > 0x00009709e000-0x0000970b3fff [Runtime Code |RUN| | | | |WB|WT|WC|UC]* > 0x0000970b4000-0x0000970f4fff [Runtime Data |RUN| | | | |WB|WT|WC|UC]* > 0x0000970f5000-0x000097117fff [Runtime Code |RUN| | | | |WB|WT|WC|UC]* > 0x000097118000-0x000097199fff [Runtime Data |RUN| | | | |WB|WT|WC|UC]* > 0x00009719a000-0x0000971dffff [Runtime Code |RUN| | | | |WB|WT|WC|UC]* > 0x0000971e0000-0x0000997f8fff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x0000997f9000-0x0000998f1fff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x0000998f2000-0x0000999eafff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x0000999eb000-0x00009af09fff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009af0a000-0x00009af21fff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00009af22000-0x00009af46fff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009af47000-0x00009af5bfff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00009af5c000-0x00009afe1fff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009afe2000-0x00009afe2fff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00009afe3000-0x00009c01ffff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009c020000-0x00009efbffff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00009efc0000-0x00009f14efff [Boot Code | | | | | |WB|WT|WC|UC]* > 0x00009f14f000-0x00009f162fff [Runtime Code |RUN| | | | |WB|WT|WC|UC]* > 0x00009f163000-0x00009f194fff [Runtime Data |RUN| | | | |WB|WT|WC|UC]* > 0x00009f195000-0x00009f197fff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009f198000-0x00009f198fff [Runtime Data |RUN| | | | |WB|WT|WC|UC]* > 0x00009f199000-0x00009f1acfff [Conventional Memory| | | | | |WB|WT|WC|UC] > 0x00009f1ad000-0x00009f1affff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x00009f1b0000-0x00009f1b0fff [Runtime Data |RUN| | | | |WB|WT|WC|UC]* > 0x00009f1b1000-0x00009fffffff [Boot Data | | | | | |WB|WT|WC|UC]* > 0x000004000000-0x000007ffffff [Memory Mapped I/O |RUN| | | | | | | |UC] > 0x000009010000-0x000009010fff [Memory Mapped I/O |RUN| | | | | | | |UC] The attribute bitmap is now displayed, in decoded form. Signed-off-by: Laszlo Ersek Tested-by: Ard Biesheuvel Acked-by: Ard Biesheuvel Signed-off-by: Matt Fleming --- arch/arm64/kernel/efi.c | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 8f5db4a3c9d9..865fdf5c7344 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -120,23 +120,6 @@ out: return retval; } -static __initdata char memory_type_name[][32] = { - {"Reserved"}, - {"Loader Code"}, - {"Loader Data"}, - {"Boot Code"}, - {"Boot Data"}, - {"Runtime Code"}, - {"Runtime Data"}, - {"Conventional Memory"}, - {"Unusable Memory"}, - {"ACPI Reclaim Memory"}, - {"ACPI Memory NVS"}, - {"Memory Mapped I/O"}, - {"MMIO Port Space"}, - {"PAL Code"}, -}; - /* * Return true for RAM regions we want to permanently reserve. */ @@ -167,10 +150,13 @@ static __init void reserve_regions(void) paddr = md->phys_addr; npages = md->num_pages; - if (uefi_debug) - pr_info(" 0x%012llx-0x%012llx [%s]", + if (uefi_debug) { + char buf[64]; + + pr_info(" 0x%012llx-0x%012llx %s", paddr, paddr + (npages << EFI_PAGE_SHIFT) - 1, - memory_type_name[md->type]); + efi_md_typeattr_format(buf, sizeof(buf), md)); + } memrange_efi_to_native(&paddr, &npages); size = npages << PAGE_SHIFT; -- cgit v1.2.3 From b2fce819a841eed21034c10a6fe3a8f43532dfb2 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Sat, 6 Sep 2014 06:02:53 -0700 Subject: efi: Resolve some shadow warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is a really bad idea to declare variables or parameters that have the same name as common types. It is valid C, but it gets surprising if a macro expansion attempts to declare an inner local with that type. Change the local names to eliminate the hazard. Change s16 => str16, s8 => str8. This resolves warnings seen when using W=2 during make, for instance: drivers/firmware/efi/vars.c: In function ‘dup_variable_bug’: drivers/firmware/efi/vars.c:324:44: warning: declaration of ‘s16’ shadows a global declaration [-Wshadow] static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, drivers/firmware/efi/vars.c:328:8: warning: declaration of ‘s8’ shadows a global declaration [-Wshadow] char *s8; Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Signed-off-by: Matt Fleming --- drivers/firmware/efi/vars.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index f0a43646a2f3..1fa724f31b0e 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -321,11 +321,11 @@ static unsigned long var_name_strnsize(efi_char16_t *variable_name, * Print a warning when duplicate EFI variables are encountered and * disable the sysfs workqueue since the firmware is buggy. */ -static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, +static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid, unsigned long len16) { size_t i, len8 = len16 / sizeof(efi_char16_t); - char *s8; + char *str8; /* * Disable the workqueue since the algorithm it uses for @@ -334,16 +334,16 @@ static void dup_variable_bug(efi_char16_t *s16, efi_guid_t *vendor_guid, */ efivar_wq_enabled = false; - s8 = kzalloc(len8, GFP_KERNEL); - if (!s8) + str8 = kzalloc(len8, GFP_KERNEL); + if (!str8) return; for (i = 0; i < len8; i++) - s8[i] = s16[i]; + str8[i] = str16[i]; printk(KERN_WARNING "efivars: duplicate variable: %s-%pUl\n", - s8, vendor_guid); - kfree(s8); + str8, vendor_guid); + kfree(str8); } /** -- cgit v1.2.3 From 24ffd84b606dafa93a5f8f6e6dbaa06fbde11632 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 7 Sep 2014 19:42:14 +0200 Subject: x86/efi: Remove unused efi_call* macros Complement commit 62fa6e69a436 ("x86/efi: Delete most of the efi_call* macros") and delete the stub macros for the !CONFIG_EFI case, too. In fact, there are no EFI calls in this case so we don't need a dummy for efi_call() even. Signed-off-by: Mathias Krause Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index b39ee5f2c02d..5375df643818 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -188,16 +188,6 @@ extern struct efi_config *efi_early; extern bool efi_reboot_required(void); #else -/* - * IF EFI is not configured, have the EFI calls return -ENOSYS. - */ -#define efi_call0(_f) (-ENOSYS) -#define efi_call1(_f, _a1) (-ENOSYS) -#define efi_call2(_f, _a1, _a2) (-ENOSYS) -#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS) -#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS) -#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS) -#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS) static inline void parse_efi_setup(u64 phys_addr, u32 data_len) {} static inline bool efi_reboot_required(void) { -- cgit v1.2.3 From 609206854767559ecbb1c308bbd6ef7af13d8c6e Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 7 Sep 2014 19:42:15 +0200 Subject: x86/efi: Unexport add_efi_memmap variable This variable was accidentally exported, even though it's only used in this compilation unit and only during initialization. Remove the bogus export, make the variable static instead and mark it as __initdata. Fixes: 200001eb140e ("x86 boot: only pick up additional EFI memmap...") Cc: Paul Jackson Signed-off-by: Mathias Krause Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 1 - arch/x86/platform/efi/efi.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 5375df643818..ce7b1c5ff78d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -88,7 +88,6 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, #define efi_in_nmi() in_nmi() -extern int add_efi_memmap; extern struct efi_scratch efi_scratch; extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern int efi_memblock_x86_reserve_range(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f0be3d1da3de..03938d17ad6d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -70,9 +70,7 @@ static efi_config_table_type_t arch_tables[] __initdata = { u64 efi_setup; /* efi setup_data physical address */ -int add_efi_memmap; -EXPORT_SYMBOL(add_efi_memmap); - +static int add_efi_memmap __initdata; static int __init setup_add_efi_memmap(char *arg) { add_efi_memmap = 1; -- cgit v1.2.3 From 0ce4605c9aa7b9c25bb4d117eb07db67cd7f1991 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 7 Sep 2014 19:42:16 +0200 Subject: x86/efi: Update comment regarding required phys mapped EFI services Commit 3f4a7836e331 ("x86/efi: Rip out phys_efi_get_time()") left set_virtual_address_map as the only runtime service needed with a phys mapping but missed to update the preceding comment. Fix that. Signed-off-by: Mathias Krause Signed-off-by: Matt Fleming --- arch/x86/platform/efi/efi.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 03938d17ad6d..1bc2c878e42e 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -337,9 +337,9 @@ static int __init efi_runtime_init32(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) @@ -361,9 +361,9 @@ static int __init efi_runtime_init64(void) } /* - * We will only need *early* access to the following two - * EFI runtime services before set_virtual_address_map - * is invoked. + * We will only need *early* access to the SetVirtualAddressMap + * EFI runtime service. All other runtime services will be called + * via the virtual mapping. */ efi_phys.set_virtual_address_map = (efi_set_virtual_address_map_t *) -- cgit v1.2.3 From 4e78eb056136b002ecdfbbf61436fedfb8a3c76b Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Sun, 7 Sep 2014 19:42:17 +0200 Subject: x86/efi: Mark initialization code as such The 32 bit and 64 bit implementations differ in their __init annotations for some functions referenced from the common EFI code. Namely, the 32 bit variant is missing some of the __init annotations the 64 bit variant has. To solve the colliding annotations, mark the corresponding functions in efi_32.c as initialization code, too -- as it is such. Actually, quite a few more functions are only used during initialization and therefore can be marked __init. They are therefore annotated, too. Also add the __init annotation to the prototypes in the efi.h header so users of those functions will see it's meant as initialization code only. This patch also fixes the "prelog" typo. ("prologue" / "epilogue" might be more appropriate but this is C code after all, not an opera! :D) Signed-off-by: Mathias Krause Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 20 ++++++++++---------- arch/x86/platform/efi/efi.c | 4 ++-- arch/x86/platform/efi/efi_32.c | 12 +++++++----- arch/x86/platform/efi/efi_64.c | 6 +++--- arch/x86/platform/efi/efi_stub_32.S | 4 ++-- 5 files changed, 24 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index ce7b1c5ff78d..abba1fe1db5e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -81,25 +81,25 @@ extern u64 asmlinkage efi_call(void *fp, ...); */ #define __efi_call_virt(f, args...) efi_call_virt(f, args) -extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size, - u32 type, u64 attribute); +extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, + u32 type, u64 attribute); #endif /* CONFIG_X86_32 */ #define efi_in_nmi() in_nmi() extern struct efi_scratch efi_scratch; -extern void efi_set_executable(efi_memory_desc_t *md, bool executable); -extern int efi_memblock_x86_reserve_range(void); -extern void efi_call_phys_prelog(void); -extern void efi_call_phys_epilog(void); -extern void efi_unmap_memmap(void); -extern void efi_memory_uc(u64 addr, unsigned long size); +extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); +extern int __init efi_memblock_x86_reserve_range(void); +extern void __init efi_call_phys_prolog(void); +extern void __init efi_call_phys_epilog(void); +extern void __init efi_unmap_memmap(void); +extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); -extern int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); +extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_mkexec(void); diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 1bc2c878e42e..dbc8627a5cdf 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -86,7 +86,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map( { efi_status_t status; - efi_call_phys_prelog(); + efi_call_phys_prolog(); status = efi_call_phys(efi_phys.set_virtual_address_map, memory_map_size, descriptor_size, descriptor_version, virtual_map); @@ -530,7 +530,7 @@ void __init runtime_code_page_mkexec(void) } } -void efi_memory_uc(u64 addr, unsigned long size) +void __init efi_memory_uc(u64 addr, unsigned long size) { unsigned long page_shift = 1UL << EFI_PAGE_SHIFT; u64 npages; diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 9ee3491e31fb..40e7cda52936 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -33,7 +33,7 @@ /* * To make EFI call EFI runtime service in physical addressing mode we need - * prelog/epilog before/after the invocation to disable interrupt, to + * prolog/epilog before/after the invocation to disable interrupt, to * claim EFI runtime service handler exclusively and to duplicate a memory in * low memory space say 0 - 3G. */ @@ -41,11 +41,13 @@ static unsigned long efi_rt_eflags; void efi_sync_low_kernel_mappings(void) {} void __init efi_dump_pagetable(void) {} -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) {} +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +{ +} void __init efi_map_region(efi_memory_desc_t *md) { @@ -55,7 +57,7 @@ void __init efi_map_region(efi_memory_desc_t *md) void __init efi_map_region_fixed(efi_memory_desc_t *md) {} void __init parse_efi_setup(u64 phys_addr, u32 data_len) {} -void efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { struct desc_ptr gdt_descr; @@ -69,7 +71,7 @@ void efi_call_phys_prelog(void) load_gdt(&gdt_descr); } -void efi_call_phys_epilog(void) +void __init efi_call_phys_epilog(void) { struct desc_ptr gdt_descr; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 290d397e1dd9..35aecb6042fb 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -79,7 +79,7 @@ static void __init early_code_mapping_set_exec(int executable) } } -void __init efi_call_phys_prelog(void) +void __init efi_call_phys_prolog(void) { unsigned long vaddress; int pgd; @@ -139,7 +139,7 @@ void efi_sync_low_kernel_mappings(void) sizeof(pgd_t) * num_pgds); } -int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) +int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long text; struct page *page; @@ -192,7 +192,7 @@ int efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) +void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) { pgd_t *pgd = (pgd_t *)__va(real_mode_header->trampoline_pgd); diff --git a/arch/x86/platform/efi/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S index fbe66e626c09..040192b50d02 100644 --- a/arch/x86/platform/efi/efi_stub_32.S +++ b/arch/x86/platform/efi/efi_stub_32.S @@ -27,13 +27,13 @@ ENTRY(efi_call_phys) * set to 0x0010, DS and SS have been set to 0x0018. In EFI, I found * the values of these registers are the same. And, the corresponding * GDT entries are identical. So I will do nothing about segment reg - * and GDT, but change GDT base register in prelog and epilog. + * and GDT, but change GDT base register in prolog and epilog. */ /* * 1. Now I am running with EIP = + PAGE_OFFSET. * But to make it smoothly switch from virtual mode to flat mode. - * The mapping of lower virtual memory has been created in prelog and + * The mapping of lower virtual memory has been created in prolog and * epilog. */ movl $1f, %edx -- cgit v1.2.3 From 77e21e87acec8c857df4bfbc647ea21d0a87364d Mon Sep 17 00:00:00 2001 From: Andre Müller Date: Wed, 10 Sep 2014 01:00:22 +0200 Subject: x86/efi: Adding efi_printks on memory allocationa and pci.reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All other calls to allocate memory seem to make some noise already, with the exception of two calls (for gop, uga) in the setup_graphics path. The purpose is to be noisy on worrysome errors immediately. commit fb86b2440de0 ("x86/efi: Add better error logging to EFI boot stub") introduces printing false alarms for lots of hardware. Rather than playing Whack a Mole with non-fatal exit conditions, try the other way round. This is per Matt Fleming's suggestion: > Where I think we could improve things > is by adding efi_printk() message in certain error paths. Clearly, not > all error paths need such messages, e.g. the EFI_INVALID_PARAMETER path > you highlighted above, but it makes sense for memory allocation and PCI > read failures. Link: http://article.gmane.org/gmane.linux.kernel.efi/4628 Signed-off-by: Andre Müller Cc: Ulf Winkelvos Signed-off-by: Matt Fleming --- arch/x86/boot/compressed/eboot.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index f4bdab1dbf66..ed8e06c59a4c 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -323,8 +323,10 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } memset(rom, 0, sizeof(*rom)); @@ -337,14 +339,18 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -427,8 +433,10 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) size = pci->romsize + sizeof(*rom); status = efi_call_early(allocate_pool, EFI_LOADER_DATA, size, &rom); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for rom\n"); return status; + } rom->data.type = SETUP_PCI; rom->data.len = size - sizeof(struct setup_data); @@ -439,14 +447,18 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_VENDOR_ID, 1, &(rom->vendor)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->vendor\n"); goto free_struct; + } status = efi_early->call(pci->pci.read, pci, EfiPciIoWidthUint16, PCI_DEVICE_ID, 1, &(rom->devid)); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to read rom->devid\n"); goto free_struct; + } status = efi_early->call(pci->get_location, pci, &(rom->segment), &(rom->bus), &(rom->device), &(rom->function)); @@ -526,8 +538,10 @@ static efi_status_t setup_efi_pci(struct boot_params *params) EFI_LOADER_DATA, size, (void **)&pci_handle); - if (status != EFI_SUCCESS) + if (status != EFI_SUCCESS) { + efi_printk(sys_table, "Failed to alloc mem for pci_handle\n"); return status; + } status = efi_call_early(locate_handle, EFI_LOCATE_BY_PROTOCOL, &pci_proto, -- cgit v1.2.3 From 6d80dba1c9fe4316ef626980102b92fa30c7845a Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 30 Sep 2014 21:58:52 +0100 Subject: efi: Provide a non-blocking SetVariable() operation There are some circumstances that call for trying to write an EFI variable in a non-blocking way. One such scenario is when writing pstore data in efi_pstore_write() via the pstore_dump() kdump callback. Now that we have an EFI runtime spinlock we need a way of aborting if there is contention instead of spinning, since when writing pstore data from the kdump callback, the runtime lock may already be held by the CPU that's running the callback if we crashed in the middle of an EFI variable operation. The situation is sufficiently special that a new EFI variable operation is warranted. Introduce ->set_variable_nonblocking() for this use case. It is an optional EFI backend operation, and need only be implemented by those backends that usually acquire locks to serialize access to EFI variables, as is the case for virt_efi_set_variable() where we now grab the EFI runtime spinlock. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Matthew Garrett Signed-off-by: Matt Fleming --- drivers/firmware/efi/runtime-wrappers.c | 19 +++++++++++++ drivers/firmware/efi/vars.c | 47 +++++++++++++++++++++++++++++++++ include/linux/efi.h | 6 +++++ 3 files changed, 72 insertions(+) diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 9694cba665c4..4349206198b2 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -200,6 +200,24 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, return status; } +static efi_status_t +virt_efi_set_variable_nonblocking(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, + void *data) +{ + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&efi_runtime_lock, flags)) + return EFI_NOT_READY; + + status = efi_call_virt(set_variable, name, vendor, attr, data_size, + data); + spin_unlock_irqrestore(&efi_runtime_lock, flags); + return status; +} + + static efi_status_t virt_efi_query_variable_info(u32 attr, u64 *storage_space, u64 *remaining_space, @@ -287,6 +305,7 @@ void efi_native_runtime_setup(void) efi.get_variable = virt_efi_get_variable; efi.get_next_variable = virt_efi_get_next_variable; efi.set_variable = virt_efi_set_variable; + efi.set_variable_nonblocking = virt_efi_set_variable_nonblocking; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; efi.query_variable_info = virt_efi_query_variable_info; diff --git a/drivers/firmware/efi/vars.c b/drivers/firmware/efi/vars.c index 1fa724f31b0e..fa3c66bdc1e5 100644 --- a/drivers/firmware/efi/vars.c +++ b/drivers/firmware/efi/vars.c @@ -595,6 +595,39 @@ int efivar_entry_set(struct efivar_entry *entry, u32 attributes, } EXPORT_SYMBOL_GPL(efivar_entry_set); +/* + * efivar_entry_set_nonblocking - call set_variable_nonblocking() + * + * This function is guaranteed to not block and is suitable for calling + * from crash/panic handlers. + * + * Crucially, this function will not block if it cannot acquire + * __efivars->lock. Instead, it returns -EBUSY. + */ +static int +efivar_entry_set_nonblocking(efi_char16_t *name, efi_guid_t vendor, + u32 attributes, unsigned long size, void *data) +{ + const struct efivar_operations *ops = __efivars->ops; + unsigned long flags; + efi_status_t status; + + if (!spin_trylock_irqsave(&__efivars->lock, flags)) + return -EBUSY; + + status = check_var_size(attributes, size + ucs2_strsize(name, 1024)); + if (status != EFI_SUCCESS) { + spin_unlock_irqrestore(&__efivars->lock, flags); + return -ENOSPC; + } + + status = ops->set_variable_nonblocking(name, &vendor, attributes, + size, data); + + spin_unlock_irqrestore(&__efivars->lock, flags); + return efi_status_to_err(status); +} + /** * efivar_entry_set_safe - call set_variable() if enough space in firmware * @name: buffer containing the variable name @@ -622,6 +655,20 @@ int efivar_entry_set_safe(efi_char16_t *name, efi_guid_t vendor, u32 attributes, if (!ops->query_variable_store) return -ENOSYS; + /* + * If the EFI variable backend provides a non-blocking + * ->set_variable() operation and we're in a context where we + * cannot block, then we need to use it to avoid live-locks, + * since the implication is that the regular ->set_variable() + * will block. + * + * If no ->set_variable_nonblocking() is provided then + * ->set_variable() is assumed to be non-blocking. + */ + if (!block && ops->set_variable_nonblocking) + return efivar_entry_set_nonblocking(name, vendor, attributes, + size, data); + if (!block) { if (!spin_trylock_irqsave(&__efivars->lock, flags)) return -EBUSY; diff --git a/include/linux/efi.h b/include/linux/efi.h index 78b29b133e14..0949f9c7e872 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -503,6 +503,10 @@ typedef efi_status_t efi_get_next_variable_t (unsigned long *name_size, efi_char typedef efi_status_t efi_set_variable_t (efi_char16_t *name, efi_guid_t *vendor, u32 attr, unsigned long data_size, void *data); +typedef efi_status_t +efi_set_variable_nonblocking_t(efi_char16_t *name, efi_guid_t *vendor, + u32 attr, unsigned long data_size, void *data); + typedef efi_status_t efi_get_next_high_mono_count_t (u32 *count); typedef void efi_reset_system_t (int reset_type, efi_status_t status, unsigned long data_size, efi_char16_t *data); @@ -822,6 +826,7 @@ extern struct efi { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_info_t *query_variable_info; efi_update_capsule_t *update_capsule; efi_query_capsule_caps_t *query_capsule_caps; @@ -1042,6 +1047,7 @@ struct efivar_operations { efi_get_variable_t *get_variable; efi_get_next_variable_t *get_next_variable; efi_set_variable_t *set_variable; + efi_set_variable_nonblocking_t *set_variable_nonblocking; efi_query_variable_store_t *query_variable_store; }; -- cgit v1.2.3 From 60b4dc7720a5251f5dbda69ad404e0bcb3cb6bfb Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 30 Sep 2014 15:03:38 +0100 Subject: efi: Delete the in_nmi() conditional runtime locking commit 5dc3826d9f08 ("efi: Implement mandatory locking for UEFI Runtime Services") implemented some conditional locking when accessing variable runtime services that Ingo described as "pretty disgusting". The intention with the !efi_in_nmi() checks was to avoid live-locks when trying to write pstore crash data into an EFI variable. Such lockless accesses are allowed according to the UEFI specification when we're in a "non-recoverable" state, but whether or not things are implemented correctly in actual firmware implementations remains an unanswered question, and so it would seem sensible to avoid doing any kind of unsynchronized variable accesses. Furthermore, the efi_in_nmi() tests are inadequate because they don't account for the case where we call EFI variable services from panic or oops callbacks and aren't executing in NMI context. In other words, live-locking is still possible. Let's just remove the conditional locking altogether. Now we've got the ->set_variable_nonblocking() EFI variable operation we can abort if the runtime lock is already held. Aborting is by far the safest option. Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Ard Biesheuvel Cc: Matthew Garrett Signed-off-by: Matt Fleming --- arch/x86/include/asm/efi.h | 2 -- drivers/firmware/efi/runtime-wrappers.c | 17 ++++------------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index abba1fe1db5e..408ad6d3222e 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -86,8 +86,6 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size, #endif /* CONFIG_X86_32 */ -#define efi_in_nmi() in_nmi() - extern struct efi_scratch efi_scratch; extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable); extern int __init efi_memblock_x86_reserve_range(void); diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c index 4349206198b2..228bbf910461 100644 --- a/drivers/firmware/efi/runtime-wrappers.c +++ b/drivers/firmware/efi/runtime-wrappers.c @@ -86,9 +86,6 @@ static DEFINE_SPINLOCK(efi_runtime_lock); * for QueryVariableInfo() and SetVariable(), as these can be reached in NMI * context through efi_pstore_write(). */ -#ifndef efi_in_nmi -#define efi_in_nmi() (0) -#endif /* * As per commit ef68c8f87ed1 ("x86: Serialize EFI time accesses on rtc_lock"), @@ -189,14 +186,11 @@ static efi_status_t virt_efi_set_variable(efi_char16_t *name, { unsigned long flags; efi_status_t status; - bool __in_nmi = efi_in_nmi(); - if (!__in_nmi) - spin_lock_irqsave(&efi_runtime_lock, flags); + spin_lock_irqsave(&efi_runtime_lock, flags); status = efi_call_virt(set_variable, name, vendor, attr, data_size, data); - if (!__in_nmi) - spin_unlock_irqrestore(&efi_runtime_lock, flags); + spin_unlock_irqrestore(&efi_runtime_lock, flags); return status; } @@ -225,17 +219,14 @@ static efi_status_t virt_efi_query_variable_info(u32 attr, { unsigned long flags; efi_status_t status; - bool __in_nmi = efi_in_nmi(); if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) return EFI_UNSUPPORTED; - if (!__in_nmi) - spin_lock_irqsave(&efi_runtime_lock, flags); + spin_lock_irqsave(&efi_runtime_lock, flags); status = efi_call_virt(query_variable_info, attr, storage_space, remaining_space, max_variable_size); - if (!__in_nmi) - spin_unlock_irqrestore(&efi_runtime_lock, flags); + spin_unlock_irqrestore(&efi_runtime_lock, flags); return status; } -- cgit v1.2.3 From 3f71f6da7791a5feae0ff07e718431d1df01273a Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 11 Sep 2014 10:20:40 +0200 Subject: efi: rtc-efi: Export platform:rtc-efi as module alias When the rtc-efi driver is built as a module, we already register the EFI rtc as a platform device if UEFI Runtime Services are enabled. To wire it up to udev, and let the module be loaded automatically, we need to export the 'platform:rtc-efi' alias from the module. Signed-off-by: Ard Biesheuvel Cc: Alessandro Zummo Cc: Mark Salter Cc: Dave Young Signed-off-by: Matt Fleming --- drivers/rtc/rtc-efi.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index 8225b89de810..59a55e7abf78 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -235,3 +235,4 @@ module_platform_driver_probe(efi_rtc_driver, efi_rtc_probe); MODULE_AUTHOR("dann frazier "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("EFI RTC driver"); +MODULE_ALIAS("platform:rtc-efi"); -- cgit v1.2.3 From 7c9e7a6fe11c8dc5b3b9d0e889dde73347247584 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 1 Oct 2014 16:07:05 -0700 Subject: target: Add a user-passthrough backstore Add a LIO storage engine that presents commands to userspace for execution. This would allow more complex backstores to be implemented out-of-kernel, and also make experimentation a-la FUSE (but at the SCSI level -- "SUSE"?) possible. It uses a mmap()able UIO device per LUN to share a command ring and data area. The commands are raw SCSI CDBs and iovs for in/out data. The command ring is also reused for returning scsi command status and optional sense data. This implementation is based on Shaohua Li's earlier version but heavily modified. Differences include: * Shared memory allocated by kernel, not locked-down user pages * Single ring for command request and response * Offsets instead of embedded pointers * Generic SCSI CDB passthrough instead of per-cmd specialization in ring format. * Uses UIO device instead of anon_file passed in mailbox. * Optional in-kernel handling of some commands. The main reason for these differences is to permit greater resiliency if the user process dies or hangs. Things not yet implemented (on purpose): * Zero copy. The data area is flexible enough to allow page flipping or backend-allocated pages to be used by fabrics, but it's not clear these are performance wins. Can come later. * Out-of-order command completion by userspace. Possible to add by just allowing userspace to change cmd_id in rsp cmd entries, but currently not supported. * No locks between kernel cmd submission and completion routines. Sounds like it's possible, but this can come later. * Sparse allocation of mmaped area. Current code vmallocs the whole thing. If the mapped area was larger and not fully mapped then the driver would have more freedom to change cmd and data area sizes based on demand. Current code open issues: * The use of idrs may be overkill -- we maybe can replace them with a simple counter to generate cmd_ids, and a hash table to get a cmd_id's associated pointer. * Use of a free-running counter for cmd ring instead of explicit modulo math. This would require power-of-2 cmd ring size. (Add kconfig depends NET - Randy) Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/Kconfig | 7 + drivers/target/Makefile | 1 + drivers/target/target_core_transport.c | 4 + drivers/target/target_core_user.c | 1163 ++++++++++++++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/target_core_user.h | 142 ++++ 6 files changed, 1318 insertions(+) create mode 100644 drivers/target/target_core_user.c create mode 100644 include/uapi/linux/target_core_user.h diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig index dc2d84ac5a0e..81d44c477a5b 100644 --- a/drivers/target/Kconfig +++ b/drivers/target/Kconfig @@ -31,6 +31,13 @@ config TCM_PSCSI Say Y here to enable the TCM/pSCSI subsystem plugin for non-buffered passthrough access to Linux/SCSI device +config TCM_USER + tristate "TCM/USER Subsystem Plugin for Linux" + depends on UIO && NET + help + Say Y here to enable the TCM/USER subsystem plugin for a userspace + process to handle requests + source "drivers/target/loopback/Kconfig" source "drivers/target/tcm_fc/Kconfig" source "drivers/target/iscsi/Kconfig" diff --git a/drivers/target/Makefile b/drivers/target/Makefile index 85b012d2f89b..bbb4a7d638ef 100644 --- a/drivers/target/Makefile +++ b/drivers/target/Makefile @@ -22,6 +22,7 @@ obj-$(CONFIG_TARGET_CORE) += target_core_mod.o obj-$(CONFIG_TCM_IBLOCK) += target_core_iblock.o obj-$(CONFIG_TCM_FILEIO) += target_core_file.o obj-$(CONFIG_TCM_PSCSI) += target_core_pscsi.o +obj-$(CONFIG_TCM_USER) += target_core_user.o # Fabric modules obj-$(CONFIG_LOOPBACK_TARGET) += loopback/ diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 9700ea125268..9ea0d5f03f7a 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -232,6 +232,10 @@ void transport_subsystem_check_init(void) if (ret != 0) pr_err("Unable to load target_core_pscsi\n"); + ret = request_module("target_core_user"); + if (ret != 0) + pr_err("Unable to load target_core_user\n"); + sub_api_initialized = 1; } diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c new file mode 100644 index 000000000000..6608ecf94570 --- /dev/null +++ b/drivers/target/target_core_user.c @@ -0,0 +1,1163 @@ +/* + * Copyright (C) 2013 Shaohua Li + * Copyright (C) 2014 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Define a shared-memory interface for LIO to pass SCSI commands and + * data to userspace for processing. This is to allow backends that + * are too complex for in-kernel support to be possible. + * + * It uses the UIO framework to do a lot of the device-creation and + * introspection work for us. + * + * See the .h file for how the ring is laid out. Note that while the + * command ring is defined, the particulars of the data area are + * not. Offset values in the command entry point to other locations + * internal to the mmap()ed area. There is separate space outside the + * command ring for data buffers. This leaves maximum flexibility for + * moving buffer allocations, or even page flipping or other + * allocation techniques, without altering the command ring layout. + * + * SECURITY: + * The user process must be assumed to be malicious. There's no way to + * prevent it breaking the command ring protocol if it wants, but in + * order to prevent other issues we must only ever read *data* from + * the shared memory area, not offsets or sizes. This applies to + * command ring entries as well as the mailbox. Extra code needed for + * this may have a 'UAM' comment. + */ + + +#define TCMU_TIME_OUT (30 * MSEC_PER_SEC) + +#define CMDR_SIZE (16 * 4096) +#define DATA_SIZE (257 * 4096) + +#define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE) + +static struct device *tcmu_root_device; + +struct tcmu_hba { + u32 host_id; +}; + +/* User wants all cmds or just some */ +enum passthru_level { + TCMU_PASS_ALL = 0, + TCMU_PASS_IO, + TCMU_PASS_INVALID, +}; + +#define TCMU_CONFIG_LEN 256 + +struct tcmu_dev { + struct se_device se_dev; + + char *name; + struct se_hba *hba; + +#define TCMU_DEV_BIT_OPEN 0 +#define TCMU_DEV_BIT_BROKEN 1 + unsigned long flags; + enum passthru_level pass_level; + + struct uio_info uio_info; + + struct tcmu_mailbox *mb_addr; + size_t dev_size; + u32 cmdr_size; + u32 cmdr_last_cleaned; + /* Offset of data ring from start of mb */ + size_t data_off; + size_t data_size; + /* Ring head + tail values. */ + /* Must add data_off and mb_addr to get the address */ + size_t data_head; + size_t data_tail; + + wait_queue_head_t wait_cmdr; + /* TODO should this be a mutex? */ + spinlock_t cmdr_lock; + + struct idr commands; + spinlock_t commands_lock; + + struct timer_list timeout; + + char dev_config[TCMU_CONFIG_LEN]; +}; + +#define TCMU_DEV(_se_dev) container_of(_se_dev, struct tcmu_dev, se_dev) + +#define CMDR_OFF sizeof(struct tcmu_mailbox) + +struct tcmu_cmd { + struct se_cmd *se_cmd; + struct tcmu_dev *tcmu_dev; + + uint16_t cmd_id; + + /* Can't use se_cmd->data_length when cleaning up expired cmds, because if + cmd has been completed then accessing se_cmd is off limits */ + size_t data_length; + + unsigned long deadline; + +#define TCMU_CMD_BIT_EXPIRED 0 + unsigned long flags; +}; + +static struct kmem_cache *tcmu_cmd_cache; + +/* multicast group */ +enum tcmu_multicast_groups { + TCMU_MCGRP_CONFIG, +}; + +static const struct genl_multicast_group tcmu_mcgrps[] = { + [TCMU_MCGRP_CONFIG] = { .name = "config", }, +}; + +/* Our generic netlink family */ +static struct genl_family tcmu_genl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = 0, + .name = "TCM-USER", + .version = 1, + .maxattr = TCMU_ATTR_MAX, + .mcgrps = tcmu_mcgrps, + .n_mcgrps = ARRAY_SIZE(tcmu_mcgrps), +}; + +static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int cmd_id; + + tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); + if (!tcmu_cmd) + return NULL; + + tcmu_cmd->se_cmd = se_cmd; + tcmu_cmd->tcmu_dev = udev; + tcmu_cmd->data_length = se_cmd->data_length; + + tcmu_cmd->deadline = jiffies + msecs_to_jiffies(TCMU_TIME_OUT); + + idr_preload(GFP_KERNEL); + spin_lock_irq(&udev->commands_lock); + cmd_id = idr_alloc(&udev->commands, tcmu_cmd, 0, + USHRT_MAX, GFP_NOWAIT); + spin_unlock_irq(&udev->commands_lock); + idr_preload_end(); + + if (cmd_id < 0) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + tcmu_cmd->cmd_id = cmd_id; + + return tcmu_cmd; +} + +static inline void tcmu_flush_dcache_range(void *vaddr, size_t size) +{ + unsigned long offset = (unsigned long) vaddr & ~PAGE_MASK; + + size = round_up(size+offset, PAGE_SIZE); + vaddr -= offset; + + while (size) { + flush_dcache_page(virt_to_page(vaddr)); + size -= PAGE_SIZE; + } +} + +/* + * Some ring helper functions. We don't assume size is a power of 2 so + * we can't use circ_buf.h. + */ +static inline size_t spc_used(size_t head, size_t tail, size_t size) +{ + int diff = head - tail; + + if (diff >= 0) + return diff; + else + return size + diff; +} + +static inline size_t spc_free(size_t head, size_t tail, size_t size) +{ + /* Keep 1 byte unused or we can't tell full from empty */ + return (size - spc_used(head, tail, size) - 1); +} + +static inline size_t head_to_end(size_t head, size_t size) +{ + return size - head; +} + +#define UPDATE_HEAD(head, used, size) smp_store_release(&head, ((head % size) + used) % size) + +/* + * We can't queue a command until we have space available on the cmd ring *and* space + * space avail on the data ring. + * + * Called with ring lock held. + */ +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_needed, size_t data_needed) +{ + struct tcmu_mailbox *mb = udev->mb_addr; + size_t space; + u32 cmd_head; + + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); + if (space < cmd_needed) { + pr_debug("no cmd space: %u %u %u\n", cmd_head, + udev->cmdr_last_cleaned, udev->cmdr_size); + return false; + } + + space = spc_free(udev->data_head, udev->data_tail, udev->data_size); + if (space < data_needed) { + pr_debug("no data space: %zu %zu %zu\n", udev->data_head, + udev->data_tail, udev->data_size); + return false; + } + + return true; +} + +static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + size_t base_command_size, command_size; + size_t cmdr_space_needed; + struct tcmu_mailbox *mb; + size_t pad_size; + struct tcmu_cmd_entry *entry; + int i; + struct scatterlist *sg; + struct iovec *iov; + int iov_cnt = 0; + uint32_t cmd_head; + uint64_t cdb_off; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) + return -EINVAL; + + /* + * Must be a certain minimum size for response sense info, but + * also may be larger if the iov array is large. + * + * iovs = sgl_nents+1, for end-of-ring case, plus another 1 + * b/c size == offsetof one-past-element. + */ + base_command_size = max(offsetof(struct tcmu_cmd_entry, + req.iov[se_cmd->t_data_nents + 2]), + sizeof(struct tcmu_cmd_entry)); + command_size = base_command_size + + round_up(scsi_command_size(se_cmd->t_task_cdb), TCMU_OP_ALIGN_SIZE); + + WARN_ON(command_size & (TCMU_OP_ALIGN_SIZE-1)); + + spin_lock_irq(&udev->cmdr_lock); + + mb = udev->mb_addr; + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + if ((command_size > (udev->cmdr_size / 2)) + || tcmu_cmd->data_length > (udev->data_size - 1)) + pr_warn("TCMU: Request of size %zu/%zu may be too big for %u/%zu " + "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, + udev->cmdr_size, udev->data_size); + + /* + * Cmd end-of-ring space is too small so we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= command_size) + pad_size = 0; + else + pad_size = head_to_end(cmd_head, udev->cmdr_size); + cmdr_space_needed = command_size + pad_size; + + while (!is_ring_space_avail(udev, cmdr_space_needed, tcmu_cmd->data_length)) { + int ret; + DEFINE_WAIT(__wait); + + prepare_to_wait(&udev->wait_cmdr, &__wait, TASK_INTERRUPTIBLE); + + pr_debug("sleeping for ring space\n"); + spin_unlock_irq(&udev->cmdr_lock); + ret = schedule_timeout(msecs_to_jiffies(TCMU_TIME_OUT)); + finish_wait(&udev->wait_cmdr, &__wait); + if (!ret) { + pr_warn("tcmu: command timed out\n"); + return -ETIMEDOUT; + } + + spin_lock_irq(&udev->cmdr_lock); + + /* We dropped cmdr_lock, cmd_head is stale */ + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + } + + if (pad_size) { + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); + tcmu_hdr_set_len(&entry->hdr, pad_size); + + UPDATE_HEAD(mb->cmd_head, pad_size, udev->cmdr_size); + + cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + WARN_ON(cmd_head != 0); + } + + entry = (void *) mb + CMDR_OFF + cmd_head; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + tcmu_hdr_set_op(&entry->hdr, TCMU_OP_CMD); + tcmu_hdr_set_len(&entry->hdr, command_size); + entry->cmd_id = tcmu_cmd->cmd_id; + + /* + * Fix up iovecs, and handle if allocation in data ring wrapped. + */ + iov = &entry->req.iov[0]; + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_head, udev->data_size)); + void *from = kmap_atomic(sg_page(sg)) + sg->offset; + void *to = (void *) mb + udev->data_off + udev->data_head; + + if (tcmu_cmd->se_cmd->data_direction == DMA_TO_DEVICE) { + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + /* Even iov_base is relative to mb_addr */ + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + + /* Uh oh, we wrapped the buffer. Must split sg across 2 iovs. */ + if (sg->length != copy_bytes) { + from += copy_bytes; + copy_bytes = sg->length - copy_bytes; + + iov->iov_len = copy_bytes; + iov->iov_base = (void *) udev->data_off + udev->data_head; + + if (se_cmd->data_direction == DMA_TO_DEVICE) { + to = (void *) mb + udev->data_off + udev->data_head; + memcpy(to, from, copy_bytes); + tcmu_flush_dcache_range(to, copy_bytes); + } + + iov_cnt++; + iov++; + + UPDATE_HEAD(udev->data_head, copy_bytes, udev->data_size); + } + + kunmap_atomic(from); + } + entry->req.iov_cnt = iov_cnt; + + /* All offsets relative to mb_addr, not start of entry! */ + cdb_off = CMDR_OFF + cmd_head + base_command_size; + memcpy((void *) mb + cdb_off, se_cmd->t_task_cdb, scsi_command_size(se_cmd->t_task_cdb)); + entry->req.cdb_off = cdb_off; + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + UPDATE_HEAD(mb->cmd_head, command_size, udev->cmdr_size); + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + spin_unlock_irq(&udev->cmdr_lock); + + /* TODO: only if FLUSH and FUA? */ + uio_event_notify(&udev->uio_info); + + mod_timer(&udev->timeout, + round_jiffies_up(jiffies + msecs_to_jiffies(TCMU_TIME_OUT))); + + return 0; +} + +static int tcmu_queue_cmd(struct se_cmd *se_cmd) +{ + struct se_device *se_dev = se_cmd->se_dev; + struct tcmu_dev *udev = TCMU_DEV(se_dev); + struct tcmu_cmd *tcmu_cmd; + int ret; + + tcmu_cmd = tcmu_alloc_cmd(se_cmd); + if (!tcmu_cmd) + return -ENOMEM; + + ret = tcmu_queue_cmd_ring(tcmu_cmd); + if (ret < 0) { + pr_err("TCMU: Could not queue command\n"); + spin_lock_irq(&udev->commands_lock); + idr_remove(&udev->commands, tcmu_cmd->cmd_id); + spin_unlock_irq(&udev->commands_lock); + + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + } + + return ret; +} + +static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *entry) +{ + struct se_cmd *se_cmd = cmd->se_cmd; + struct tcmu_dev *udev = cmd->tcmu_dev; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) { + /* cmd has been completed already from timeout, just reclaim data + ring space */ + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + return; + } + + if (entry->rsp.scsi_status == SAM_STAT_CHECK_CONDITION) { + memcpy(se_cmd->sense_buffer, entry->rsp.sense_buffer, + se_cmd->scsi_sense_length); + + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } + else if (se_cmd->data_direction == DMA_FROM_DEVICE) { + struct scatterlist *sg; + int i; + + /* It'd be easier to look at entry's iovec again, but UAM */ + for_each_sg(se_cmd->t_data_sg, sg, se_cmd->t_data_nents, i) { + size_t copy_bytes; + void *to; + void *from; + + copy_bytes = min((size_t)sg->length, + head_to_end(udev->data_tail, udev->data_size)); + + to = kmap_atomic(sg_page(sg)) + sg->offset; + WARN_ON(sg->length + sg->offset > PAGE_SIZE); + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + + /* Uh oh, wrapped the data buffer for this sg's data */ + if (sg->length != copy_bytes) { + from = (void *) udev->mb_addr + udev->data_off + udev->data_tail; + WARN_ON(udev->data_tail); + to += copy_bytes; + copy_bytes = sg->length - copy_bytes; + tcmu_flush_dcache_range(from, copy_bytes); + memcpy(to, from, copy_bytes); + + UPDATE_HEAD(udev->data_tail, copy_bytes, udev->data_size); + } + + kunmap_atomic(to); + } + + } else if (se_cmd->data_direction == DMA_TO_DEVICE) { + UPDATE_HEAD(udev->data_tail, cmd->data_length, udev->data_size); + } else { + pr_warn("TCMU: data direction was %d!\n", se_cmd->data_direction); + } + + target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); +} + +static unsigned int tcmu_handle_completions(struct tcmu_dev *udev) +{ + struct tcmu_mailbox *mb; + LIST_HEAD(cpl_cmds); + unsigned long flags; + int handled = 0; + + if (test_bit(TCMU_DEV_BIT_BROKEN, &udev->flags)) { + pr_err("ring broken, not handling completions\n"); + return 0; + } + + spin_lock_irqsave(&udev->cmdr_lock, flags); + + mb = udev->mb_addr; + tcmu_flush_dcache_range(mb, sizeof(*mb)); + + while (udev->cmdr_last_cleaned != ACCESS_ONCE(mb->cmd_tail)) { + + struct tcmu_cmd_entry *entry = (void *) mb + CMDR_OFF + udev->cmdr_last_cleaned; + struct tcmu_cmd *cmd; + + tcmu_flush_dcache_range(entry, sizeof(*entry)); + + if (tcmu_hdr_get_op(&entry->hdr) == TCMU_OP_PAD) { + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + continue; + } + WARN_ON(tcmu_hdr_get_op(&entry->hdr) != TCMU_OP_CMD); + + spin_lock(&udev->commands_lock); + cmd = idr_find(&udev->commands, entry->cmd_id); + if (cmd) + idr_remove(&udev->commands, cmd->cmd_id); + spin_unlock(&udev->commands_lock); + + if (!cmd) { + pr_err("cmd_id not found, ring is broken\n"); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + break; + } + + tcmu_handle_completion(cmd, entry); + + UPDATE_HEAD(udev->cmdr_last_cleaned, tcmu_hdr_get_len(&entry->hdr), udev->cmdr_size); + + handled++; + } + + if (mb->cmd_tail == mb->cmd_head) + del_timer(&udev->timeout); /* no more pending cmds */ + + spin_unlock_irqrestore(&udev->cmdr_lock, flags); + + wake_up(&udev->wait_cmdr); + + return handled; +} + +static int tcmu_check_expired_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + + if (!time_after(cmd->deadline, jiffies)) + return 0; + + set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); + target_complete_cmd(cmd->se_cmd, SAM_STAT_CHECK_CONDITION); + cmd->se_cmd = NULL; + + kmem_cache_free(tcmu_cmd_cache, cmd); + + return 0; +} + +static void tcmu_device_timedout(unsigned long data) +{ + struct tcmu_dev *udev = (struct tcmu_dev *)data; + unsigned long flags; + int handled; + + handled = tcmu_handle_completions(udev); + + pr_warn("%d completions handled from timeout\n", handled); + + spin_lock_irqsave(&udev->commands_lock, flags); + idr_for_each(&udev->commands, tcmu_check_expired_cmd, NULL); + spin_unlock_irqrestore(&udev->commands_lock, flags); + + /* + * We don't need to wakeup threads on wait_cmdr since they have their + * own timeout. + */ +} + +static int tcmu_attach_hba(struct se_hba *hba, u32 host_id) +{ + struct tcmu_hba *tcmu_hba; + + tcmu_hba = kzalloc(sizeof(struct tcmu_hba), GFP_KERNEL); + if (!tcmu_hba) + return -ENOMEM; + + tcmu_hba->host_id = host_id; + hba->hba_ptr = tcmu_hba; + + return 0; +} + +static void tcmu_detach_hba(struct se_hba *hba) +{ + kfree(hba->hba_ptr); + hba->hba_ptr = NULL; +} + +static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) +{ + struct tcmu_dev *udev; + + udev = kzalloc(sizeof(struct tcmu_dev), GFP_KERNEL); + if (!udev) + return NULL; + + udev->name = kstrdup(name, GFP_KERNEL); + if (!udev->name) { + kfree(udev); + return NULL; + } + + udev->hba = hba; + + init_waitqueue_head(&udev->wait_cmdr); + spin_lock_init(&udev->cmdr_lock); + + idr_init(&udev->commands); + spin_lock_init(&udev->commands_lock); + + setup_timer(&udev->timeout, tcmu_device_timedout, + (unsigned long)udev); + + udev->pass_level = TCMU_PASS_ALL; + + return &udev->se_dev; +} + +static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on) +{ + struct tcmu_dev *tcmu_dev = container_of(info, struct tcmu_dev, uio_info); + + tcmu_handle_completions(tcmu_dev); + + return 0; +} + +/* + * mmap code from uio.c. Copied here because we want to hook mmap() + * and this stuff must come along. + */ +static int tcmu_find_mem_index(struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + + if (vma->vm_pgoff < MAX_UIO_MAPS) { + if (info->mem[vma->vm_pgoff].size == 0) + return -1; + return (int)vma->vm_pgoff; + } + return -1; +} + +static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct tcmu_dev *udev = vma->vm_private_data; + struct uio_info *info = &udev->uio_info; + struct page *page; + unsigned long offset; + void *addr; + + int mi = tcmu_find_mem_index(vma); + if (mi < 0) + return VM_FAULT_SIGBUS; + + /* + * We need to subtract mi because userspace uses offset = N*PAGE_SIZE + * to use mem[N]. + */ + offset = (vmf->pgoff - mi) << PAGE_SHIFT; + + addr = (void *)(unsigned long)info->mem[mi].addr + offset; + if (info->mem[mi].memtype == UIO_MEM_LOGICAL) + page = virt_to_page(addr); + else + page = vmalloc_to_page(addr); + get_page(page); + vmf->page = page; + return 0; +} + +static const struct vm_operations_struct tcmu_vm_ops = { + .fault = tcmu_vma_fault, +}; + +static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_ops = &tcmu_vm_ops; + + vma->vm_private_data = udev; + + /* Ensure the mmap is exactly the right size */ + if (vma_pages(vma) != (TCMU_RING_SIZE >> PAGE_SHIFT)) + return -EINVAL; + + return 0; +} + +static int tcmu_open(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + /* O_EXCL not supported for char devs, so fake it? */ + if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags)) + return -EBUSY; + + pr_debug("open\n"); + + return 0; +} + +static int tcmu_release(struct uio_info *info, struct inode *inode) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + + clear_bit(TCMU_DEV_BIT_OPEN, &udev->flags); + + pr_debug("close\n"); + + return 0; +} + +static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int minor) +{ + struct sk_buff *skb; + void *msg_header; + int ret; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); + if (!msg_header) { + nlmsg_free(skb); + return -ENOMEM; + } + + ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + + ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + + ret = genlmsg_end(skb, msg_header); + if (ret < 0) { + nlmsg_free(skb); + return ret; + } + + ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, + TCMU_MCGRP_CONFIG, GFP_KERNEL); + + /* We don't care if no one is listening */ + if (ret == -ESRCH) + ret = 0; + + return ret; +} + +static int tcmu_configure_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + struct tcmu_hba *hba = udev->hba->hba_ptr; + struct uio_info *info; + struct tcmu_mailbox *mb; + size_t size; + size_t used; + int ret = 0; + char *str; + + info = &udev->uio_info; + + size = snprintf(NULL, 0, "tcm-user/%u/%s/%s", hba->host_id, udev->name, + udev->dev_config); + size += 1; /* for \0 */ + str = kmalloc(size, GFP_KERNEL); + if (!str) + return -ENOMEM; + + used = snprintf(str, size, "tcm-user/%u/%s", hba->host_id, udev->name); + + if (udev->dev_config[0]) + snprintf(str + used, size - used, "/%s", udev->dev_config); + + info->name = str; + + udev->mb_addr = vzalloc(TCMU_RING_SIZE); + if (!udev->mb_addr) { + ret = -ENOMEM; + goto err_vzalloc; + } + + /* mailbox fits in first part of CMDR space */ + udev->cmdr_size = CMDR_SIZE - CMDR_OFF; + udev->data_off = CMDR_SIZE; + udev->data_size = TCMU_RING_SIZE - CMDR_SIZE; + + mb = udev->mb_addr; + mb->version = 1; + mb->cmdr_off = CMDR_OFF; + mb->cmdr_size = udev->cmdr_size; + + WARN_ON(!PAGE_ALIGNED(udev->data_off)); + WARN_ON(udev->data_size % PAGE_SIZE); + + info->version = "1"; + + info->mem[0].name = "tcm-user command & data buffer"; + info->mem[0].addr = (phys_addr_t) udev->mb_addr; + info->mem[0].size = TCMU_RING_SIZE; + info->mem[0].memtype = UIO_MEM_VIRTUAL; + + info->irqcontrol = tcmu_irqcontrol; + info->irq = UIO_IRQ_CUSTOM; + + info->mmap = tcmu_mmap; + info->open = tcmu_open; + info->release = tcmu_release; + + ret = uio_register_device(tcmu_root_device, info); + if (ret) + goto err_register; + + /* Other attributes can be configured in userspace */ + dev->dev_attrib.hw_block_size = 512; + dev->dev_attrib.hw_max_sectors = 128; + dev->dev_attrib.hw_queue_depth = 128; + + ret = tcmu_netlink_event(TCMU_CMD_ADDED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + if (ret) + goto err_netlink; + + return 0; + +err_netlink: + uio_unregister_device(&udev->uio_info); +err_register: + vfree(udev->mb_addr); +err_vzalloc: + kfree(info->name); + + return ret; +} + +static int tcmu_check_pending_cmd(int id, void *p, void *data) +{ + struct tcmu_cmd *cmd = p; + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags)) + return 0; + return -EINVAL; +} + +static void tcmu_free_device(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + int i; + + del_timer_sync(&udev->timeout); + + vfree(udev->mb_addr); + + /* Upper layer should drain all requests before calling this */ + spin_lock_irq(&udev->commands_lock); + i = idr_for_each(&udev->commands, tcmu_check_pending_cmd, NULL); + idr_destroy(&udev->commands); + spin_unlock_irq(&udev->commands_lock); + WARN_ON(i); + + /* Device was configured */ + if (udev->uio_info.uio_dev) { + tcmu_netlink_event(TCMU_CMD_REMOVED_DEVICE, udev->uio_info.name, + udev->uio_info.uio_dev->minor); + + uio_unregister_device(&udev->uio_info); + kfree(udev->uio_info.name); + kfree(udev->name); + } + + kfree(udev); +} + +enum { + Opt_dev_config, Opt_dev_size, Opt_err, Opt_pass_level, +}; + +static match_table_t tokens = { + {Opt_dev_config, "dev_config=%s"}, + {Opt_dev_size, "dev_size=%u"}, + {Opt_pass_level, "pass_level=%u"}, + {Opt_err, NULL} +}; + +static ssize_t tcmu_set_configfs_dev_params(struct se_device *dev, + const char *page, ssize_t count) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + char *orig, *ptr, *opts, *arg_p; + substring_t args[MAX_OPT_ARGS]; + int ret = 0, token; + int arg; + + opts = kstrdup(page, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + orig = opts; + + while ((ptr = strsep(&opts, ",\n")) != NULL) { + if (!*ptr) + continue; + + token = match_token(ptr, tokens, args); + switch (token) { + case Opt_dev_config: + if (match_strlcpy(udev->dev_config, &args[0], + TCMU_CONFIG_LEN) == 0) { + ret = -EINVAL; + break; + } + pr_debug("TCMU: Referencing Path: %s\n", udev->dev_config); + break; + case Opt_dev_size: + arg_p = match_strdup(&args[0]); + if (!arg_p) { + ret = -ENOMEM; + break; + } + ret = kstrtoul(arg_p, 0, (unsigned long *) &udev->dev_size); + kfree(arg_p); + if (ret < 0) + pr_err("kstrtoul() failed for dev_size=\n"); + break; + case Opt_pass_level: + match_int(args, &arg); + if (arg >= TCMU_PASS_INVALID) { + pr_warn("TCMU: Invalid pass_level: %d\n", arg); + break; + } + + pr_debug("TCMU: Setting pass_level to %d\n", arg); + udev->pass_level = arg; + break; + default: + break; + } + } + + kfree(orig); + return (!ret) ? count : ret; +} + +static ssize_t tcmu_show_configfs_dev_params(struct se_device *dev, char *b) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + ssize_t bl = 0; + + bl = sprintf(b + bl, "Config: %s ", + udev->dev_config[0] ? udev->dev_config : "NULL"); + bl += sprintf(b + bl, "Size: %zu PassLevel: %u\n", + udev->dev_size, udev->pass_level); + + return bl; +} + +static sector_t tcmu_get_blocks(struct se_device *dev) +{ + struct tcmu_dev *udev = TCMU_DEV(dev); + + return div_u64(udev->dev_size - dev->dev_attrib.block_size, + dev->dev_attrib.block_size); +} + +static sense_reason_t +tcmu_execute_rw(struct se_cmd *se_cmd, struct scatterlist *sgl, u32 sgl_nents, + enum dma_data_direction data_direction) +{ + int ret; + + ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static sense_reason_t +tcmu_pass_op(struct se_cmd *se_cmd) +{ + int ret = tcmu_queue_cmd(se_cmd); + + if (ret != 0) + return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; + else + return TCM_NO_SENSE; +} + +static struct sbc_ops tcmu_sbc_ops = { + .execute_rw = tcmu_execute_rw, + .execute_sync_cache = tcmu_pass_op, + .execute_write_same = tcmu_pass_op, + .execute_write_same_unmap = tcmu_pass_op, + .execute_unmap = tcmu_pass_op, +}; + +static sense_reason_t +tcmu_parse_cdb(struct se_cmd *cmd) +{ + unsigned char *cdb = cmd->t_task_cdb; + struct tcmu_dev *udev = TCMU_DEV(cmd->se_dev); + sense_reason_t ret; + + switch (udev->pass_level) { + case TCMU_PASS_ALL: + /* We're just like pscsi, then */ + /* + * For REPORT LUNS we always need to emulate the response, for everything + * else, pass it up. + */ + switch (cdb[0]) { + case REPORT_LUNS: + cmd->execute_cmd = spc_emulate_report_luns; + break; + case READ_6: + case READ_10: + case READ_12: + case READ_16: + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + case WRITE_VERIFY: + cmd->se_cmd_flags |= SCF_SCSI_DATA_CDB; + /* FALLTHROUGH */ + default: + cmd->execute_cmd = tcmu_pass_op; + } + ret = TCM_NO_SENSE; + break; + case TCMU_PASS_IO: + ret = sbc_parse_cdb(cmd, &tcmu_sbc_ops); + break; + default: + pr_err("Unknown tcm-user pass level %d\n", udev->pass_level); + ret = TCM_CHECK_CONDITION_ABORT_CMD; + } + + return ret; +} + +static struct se_subsystem_api tcmu_template = { + .name = "user", + .inquiry_prod = "USER", + .inquiry_rev = TCMU_VERSION, + .owner = THIS_MODULE, + .transport_type = TRANSPORT_PLUGIN_VHBA_PDEV, + .attach_hba = tcmu_attach_hba, + .detach_hba = tcmu_detach_hba, + .alloc_device = tcmu_alloc_device, + .configure_device = tcmu_configure_device, + .free_device = tcmu_free_device, + .parse_cdb = tcmu_parse_cdb, + .set_configfs_dev_params = tcmu_set_configfs_dev_params, + .show_configfs_dev_params = tcmu_show_configfs_dev_params, + .get_device_type = sbc_get_device_type, + .get_blocks = tcmu_get_blocks, +}; + +static int __init tcmu_module_init(void) +{ + int ret; + + BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0); + + tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache", + sizeof(struct tcmu_cmd), + __alignof__(struct tcmu_cmd), + 0, NULL); + if (!tcmu_cmd_cache) + return -ENOMEM; + + tcmu_root_device = root_device_register("tcm_user"); + if (IS_ERR(tcmu_root_device)) { + ret = PTR_ERR(tcmu_root_device); + goto out_free_cache; + } + + ret = genl_register_family(&tcmu_genl_family); + if (ret < 0) { + goto out_unreg_device; + } + + ret = transport_subsystem_register(&tcmu_template); + if (ret) + goto out_unreg_genl; + + return 0; + +out_unreg_genl: + genl_unregister_family(&tcmu_genl_family); +out_unreg_device: + root_device_unregister(tcmu_root_device); +out_free_cache: + kmem_cache_destroy(tcmu_cmd_cache); + + return ret; +} + +static void __exit tcmu_module_exit(void) +{ + transport_subsystem_release(&tcmu_template); + genl_unregister_family(&tcmu_genl_family); + root_device_unregister(tcmu_root_device); + kmem_cache_destroy(tcmu_cmd_cache); +} + +MODULE_DESCRIPTION("TCM USER subsystem plugin"); +MODULE_AUTHOR("Shaohua Li "); +MODULE_AUTHOR("Andy Grover "); +MODULE_LICENSE("GPL"); + +module_init(tcmu_module_init); +module_exit(tcmu_module_exit); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index be88166349a1..6ebd0d1faf2e 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -371,6 +371,7 @@ header-y += swab.h header-y += synclink.h header-y += sysctl.h header-y += sysinfo.h +header-y += target_core_user.h header-y += taskstats.h header-y += tcp.h header-y += tcp_metrics.h diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h new file mode 100644 index 000000000000..7dcfbe6771b1 --- /dev/null +++ b/include/uapi/linux/target_core_user.h @@ -0,0 +1,142 @@ +#ifndef __TARGET_CORE_USER_H +#define __TARGET_CORE_USER_H + +/* This header will be used by application too */ + +#include +#include + +#ifndef __packed +#define __packed __attribute__((packed)) +#endif + +#define TCMU_VERSION "1.0" + +/* + * Ring Design + * ----------- + * + * The mmaped area is divided into three parts: + * 1) The mailbox (struct tcmu_mailbox, below) + * 2) The command ring + * 3) Everything beyond the command ring (data) + * + * The mailbox tells userspace the offset of the command ring from the + * start of the shared memory region, and how big the command ring is. + * + * The kernel passes SCSI commands to userspace by putting a struct + * tcmu_cmd_entry in the ring, updating mailbox->cmd_head, and poking + * userspace via uio's interrupt mechanism. + * + * tcmu_cmd_entry contains a header. If the header type is PAD, + * userspace should skip hdr->length bytes (mod cmdr_size) to find the + * next cmd_entry. + * + * Otherwise, the entry will contain offsets into the mmaped area that + * contain the cdb and data buffers -- the latter accessible via the + * iov array. iov addresses are also offsets into the shared area. + * + * When userspace is completed handling the command, set + * entry->rsp.scsi_status, fill in rsp.sense_buffer if appropriate, + * and also set mailbox->cmd_tail equal to the old cmd_tail plus + * hdr->length, mod cmdr_size. If cmd_tail doesn't equal cmd_head, it + * should process the next packet the same way, and so on. + */ + +#define TCMU_MAILBOX_VERSION 1 +#define ALIGN_SIZE 64 /* Should be enough for most CPUs */ + +struct tcmu_mailbox { + __u16 version; + __u16 flags; + __u32 cmdr_off; + __u32 cmdr_size; + + __u32 cmd_head; + + /* Updated by user. On its own cacheline */ + __u32 cmd_tail __attribute__((__aligned__(ALIGN_SIZE))); + +} __packed; + +enum tcmu_opcode { + TCMU_OP_PAD = 0, + TCMU_OP_CMD, +}; + +/* + * Only a few opcodes, and length is 8-byte aligned, so use low bits for opcode. + */ +struct tcmu_cmd_entry_hdr { + __u32 len_op; +} __packed; + +#define TCMU_OP_MASK 0x7 + +static inline enum tcmu_opcode tcmu_hdr_get_op(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_op(struct tcmu_cmd_entry_hdr *hdr, enum tcmu_opcode op) +{ + hdr->len_op &= ~TCMU_OP_MASK; + hdr->len_op |= (op & TCMU_OP_MASK); +} + +static inline __u32 tcmu_hdr_get_len(struct tcmu_cmd_entry_hdr *hdr) +{ + return hdr->len_op & ~TCMU_OP_MASK; +} + +static inline void tcmu_hdr_set_len(struct tcmu_cmd_entry_hdr *hdr, __u32 len) +{ + hdr->len_op &= TCMU_OP_MASK; + hdr->len_op |= len; +} + +/* Currently the same as SCSI_SENSE_BUFFERSIZE */ +#define TCMU_SENSE_BUFFERSIZE 96 + +struct tcmu_cmd_entry { + struct tcmu_cmd_entry_hdr hdr; + + uint16_t cmd_id; + uint16_t __pad1; + + union { + struct { + uint64_t cdb_off; + uint64_t iov_cnt; + struct iovec iov[0]; + } req; + struct { + uint8_t scsi_status; + uint8_t __pad1; + uint16_t __pad2; + uint32_t __pad3; + char sense_buffer[TCMU_SENSE_BUFFERSIZE]; + } rsp; + }; + +} __packed; + +#define TCMU_OP_ALIGN_SIZE sizeof(uint64_t) + +enum tcmu_genl_cmd { + TCMU_CMD_UNSPEC, + TCMU_CMD_ADDED_DEVICE, + TCMU_CMD_REMOVED_DEVICE, + __TCMU_CMD_MAX, +}; +#define TCMU_CMD_MAX (__TCMU_CMD_MAX - 1) + +enum tcmu_genl_attr { + TCMU_ATTR_UNSPEC, + TCMU_ATTR_DEVICE, + TCMU_ATTR_MINOR, + __TCMU_ATTR_MAX, +}; +#define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) + +#endif -- cgit v1.2.3 From 6e14eab90a933c2e936639be390bf231a377b44a Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Wed, 1 Oct 2014 23:01:15 -0700 Subject: target/user: Fix up smatch warnings in tcmu_netlink_event This patch fixes up the following unused return smatch warnings: drivers/target/target_core_user.c:778 tcmu_netlink_event warn: unused return: ret = nla_put_string() drivers/target/target_core_user.c:780 tcmu_netlink_event warn: unused `return: ret = nla_put_u32() (Fix up missing semicolon: grover) Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 6608ecf94570..ac37ce6be15c 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -763,27 +763,27 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int mino { struct sk_buff *skb; void *msg_header; - int ret; + int ret = -ENOMEM; skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) - return -ENOMEM; + return ret; msg_header = genlmsg_put(skb, 0, 0, &tcmu_genl_family, 0, cmd); - if (!msg_header) { - nlmsg_free(skb); - return -ENOMEM; - } + if (!msg_header) + goto free_skb; ret = nla_put_string(skb, TCMU_ATTR_DEVICE, name); + if (ret < 0) + goto free_skb; ret = nla_put_u32(skb, TCMU_ATTR_MINOR, minor); + if (ret < 0) + goto free_skb; ret = genlmsg_end(skb, msg_header); - if (ret < 0) { - nlmsg_free(skb); - return ret; - } + if (ret < 0) + goto free_skb; ret = genlmsg_multicast(&tcmu_genl_family, skb, 0, TCMU_MCGRP_CONFIG, GFP_KERNEL); @@ -793,6 +793,9 @@ static int tcmu_netlink_event(enum tcmu_genl_cmd cmd, const char *name, int mino ret = 0; return ret; +free_skb: + nlmsg_free(skb); + return ret; } static int tcmu_configure_device(struct se_device *dev) -- cgit v1.2.3 From 1acff63f6ec2622662e647364293cc3ca495401f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 2 Oct 2014 21:40:34 -0700 Subject: iser-target: Fix smatch warning Unused return value from down_interruptible Reported-by: Or Gerlitz Signed-off-by: Sagi Grimberg Signed-off-by: Nicholas Bellinger --- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index d4c7928a0f36..40969b683fdd 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -3152,7 +3152,7 @@ isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn) accept_wait: ret = down_interruptible(&isert_np->np_sem); - if (max_accept > 5) + if (ret || max_accept > 5) return -ENODEV; spin_lock_bh(&np->np_thread_lock); -- cgit v1.2.3 From 6375f8908255ea7438b60bb5998e6b3e1628500d Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 2 Oct 2014 09:30:55 +0200 Subject: tcm_loop: Fixup tag handling The SCSI command tag is set to the tag assigned from the block layer, not the SCSI-II tag message. So we need to convert it into the correct SCSI-II tag message based on the device flags, not the tag value itself. Signed-off-by: Hannes Reinecke Reviewed-by: Sagi Grimberg Signed-off-by: Nicholas Bellinger --- drivers/target/loopback/tcm_loop.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/drivers/target/loopback/tcm_loop.c b/drivers/target/loopback/tcm_loop.c index a7f6dc646045..ab3ab27d49b7 100644 --- a/drivers/target/loopback/tcm_loop.c +++ b/drivers/target/loopback/tcm_loop.c @@ -153,18 +153,11 @@ static int tcm_loop_change_queue_type(struct scsi_device *sdev, int tag) /* * Locate the SAM Task Attr from struct scsi_cmnd * */ -static int tcm_loop_sam_attr(struct scsi_cmnd *sc) -{ - if (sc->device->tagged_supported) { - switch (sc->tag) { - case HEAD_OF_QUEUE_TAG: - return MSG_HEAD_TAG; - case ORDERED_QUEUE_TAG: - return MSG_ORDERED_TAG; - default: - break; - } - } +static int tcm_loop_sam_attr(struct scsi_cmnd *sc, int tag) +{ + if (sc->device->tagged_supported && + sc->device->ordered_tags && tag >= 0) + return MSG_ORDERED_TAG; return MSG_SIMPLE_TAG; } @@ -227,7 +220,7 @@ static void tcm_loop_submission_work(struct work_struct *work) rc = target_submit_cmd_map_sgls(se_cmd, tl_nexus->se_sess, sc->cmnd, &tl_cmd->tl_sense_buf[0], tl_cmd->sc->device->lun, - transfer_length, tcm_loop_sam_attr(sc), + transfer_length, tcm_loop_sam_attr(sc, tl_cmd->sc_cmd_tag), sc->sc_data_direction, 0, scsi_sglist(sc), scsi_sg_count(sc), sgl_bidi, sgl_bidi_count, @@ -266,7 +259,7 @@ static int tcm_loop_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc) } tl_cmd->sc = sc; - tl_cmd->sc_cmd_tag = sc->tag; + tl_cmd->sc_cmd_tag = sc->request->tag; INIT_WORK(&tl_cmd->work, tcm_loop_submission_work); queue_work(tcm_loop_workqueue, &tl_cmd->work); return 0; @@ -370,7 +363,7 @@ static int tcm_loop_abort_task(struct scsi_cmnd *sc) */ tl_tpg = &tl_hba->tl_hba_tpgs[sc->device->id]; ret = tcm_loop_issue_tmr(tl_tpg, tl_nexus, sc->device->lun, - sc->tag, TMR_ABORT_TASK); + sc->request->tag, TMR_ABORT_TASK); return (ret == TMR_FUNCTION_COMPLETE) ? SUCCESS : FAILED; } -- cgit v1.2.3 From f56574a2b554492703030e3d3b9679c9a07a5d69 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 2 Oct 2014 10:23:15 -0700 Subject: target/user: Recalculate pad size inside is_ring_space_avail() If more than one thread is waiting for command ring space that includes a PAD, then if the first one finishes (inserts a PAD and a CMD at the start of the cmd ring) then the second one will incorrectly think it still needs to insert a PAD (i.e. cmdr_space_needed is now wrong.) This will lead to it asking for more space than it actually needs, and then inserting a PAD somewhere else than at the end -- not what we want. This patch moves the pad calculation inside is_ring_space_available() so in the above scenario the second thread would then ask for space not including a PAD. The patch also inserts a PAD op based upon an up-to-date cmd_head, instead of the potentially stale value. Signed-off-by: Andy Grover Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_user.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index ac37ce6be15c..9a1b314f6482 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -236,16 +236,26 @@ static inline size_t head_to_end(size_t head, size_t size) * * Called with ring lock held. */ -static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_needed, size_t data_needed) +static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed) { struct tcmu_mailbox *mb = udev->mb_addr; size_t space; u32 cmd_head; + size_t cmd_needed; tcmu_flush_dcache_range(mb, sizeof(*mb)); cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ + /* + * If cmd end-of-ring space is too small then we need space for a NOP plus + * original cmd - cmds are internally contiguous. + */ + if (head_to_end(cmd_head, udev->cmdr_size) >= cmd_size) + cmd_needed = cmd_size; + else + cmd_needed = cmd_size + head_to_end(cmd_head, udev->cmdr_size); + space = spc_free(cmd_head, udev->cmdr_last_cleaned, udev->cmdr_size); if (space < cmd_needed) { pr_debug("no cmd space: %u %u %u\n", cmd_head, @@ -268,9 +278,7 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; struct se_cmd *se_cmd = tcmu_cmd->se_cmd; size_t base_command_size, command_size; - size_t cmdr_space_needed; struct tcmu_mailbox *mb; - size_t pad_size; struct tcmu_cmd_entry *entry; int i; struct scatterlist *sg; @@ -307,17 +315,7 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) "cmd/data ring buffers\n", command_size, tcmu_cmd->data_length, udev->cmdr_size, udev->data_size); - /* - * Cmd end-of-ring space is too small so we need space for a NOP plus - * original cmd - cmds are internally contiguous. - */ - if (head_to_end(cmd_head, udev->cmdr_size) >= command_size) - pad_size = 0; - else - pad_size = head_to_end(cmd_head, udev->cmdr_size); - cmdr_space_needed = command_size + pad_size; - - while (!is_ring_space_avail(udev, cmdr_space_needed, tcmu_cmd->data_length)) { + while (!is_ring_space_avail(udev, command_size, tcmu_cmd->data_length)) { int ret; DEFINE_WAIT(__wait); @@ -338,7 +336,10 @@ static int tcmu_queue_cmd_ring(struct tcmu_cmd *tcmu_cmd) cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */ } - if (pad_size) { + /* Insert a PAD if end-of-ring space is too small */ + if (head_to_end(cmd_head, udev->cmdr_size) < command_size) { + size_t pad_size = head_to_end(cmd_head, udev->cmdr_size); + entry = (void *) mb + CMDR_OFF + cmd_head; tcmu_flush_dcache_range(entry, sizeof(*entry)); tcmu_hdr_set_op(&entry->hdr, TCMU_OP_PAD); -- cgit v1.2.3 From 7efe665903d0d963b0ebf4cab25cc3ae32c62600 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 3 Oct 2014 13:06:33 +0100 Subject: rtc: Disable EFI rtc for x86 commit da167ad7638759 ("rtc: ia64: allow other architectures to use EFI RTC") inadvertently introduced a regression for x86 because we've been careful not to enable the EFI rtc driver due to the generally buggy implementations of the time-related EFI runtime services. In fact, since the above commit was merged we've seen reports of crashes on 32-bit tablets, https://bugzilla.kernel.org/show_bug.cgi?id=84241#c21 Disable it explicitly for x86 so that we don't give users false hope that this driver will work - it won't, and your machine is likely to crash. Acked-by: Mark Salter Cc: Dave Young Cc: Alessandro Zummo Cc: # v3.17 Signed-off-by: Matt Fleming --- drivers/rtc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index a168e96142b9..54ef393b0def 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -806,7 +806,7 @@ config RTC_DRV_DA9063 config RTC_DRV_EFI tristate "EFI RTC" - depends on EFI + depends on EFI && !X86 help If you say yes here you will get support for the EFI Real Time Clock. -- cgit v1.2.3 From f4c24db1b7ad0ce84409e15744d26c6f86a96840 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 3 Oct 2014 14:35:56 -0700 Subject: qla_target: don't delete changed nacls The code is currently riddled with "drop the hardware_lock to avoid a deadlock" bugs that expose races. One of those races seems to expose a valid warning in tcm_qla2xxx_clear_nacl_from_fcport_map. Add some bandaid to it. Signed-off-by: Joern Engel Cc: # v3.5+ Signed-off-by: Nicholas Bellinger --- drivers/scsi/qla2xxx/tcm_qla2xxx.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/tcm_qla2xxx.c b/drivers/scsi/qla2xxx/tcm_qla2xxx.c index e2beab962096..4747d2c66024 100644 --- a/drivers/scsi/qla2xxx/tcm_qla2xxx.c +++ b/drivers/scsi/qla2xxx/tcm_qla2xxx.c @@ -757,7 +757,16 @@ static void tcm_qla2xxx_clear_nacl_from_fcport_map(struct qla_tgt_sess *sess) pr_debug("fc_rport domain: port_id 0x%06x\n", nacl->nport_id); node = btree_remove32(&lport->lport_fcport_map, nacl->nport_id); - WARN_ON(node && (node != se_nacl)); + if (WARN_ON(node && (node != se_nacl))) { + /* + * The nacl no longer matches what we think it should be. + * Most likely a new dynamic acl has been added while + * someone dropped the hardware lock. It clearly is a + * bug elsewhere, but this bit can't make things worse. + */ + btree_insert32(&lport->lport_fcport_map, nacl->nport_id, + node, GFP_ATOMIC); + } pr_debug("Removed from fcport_map: %p for WWNN: 0x%016LX, port_id: 0x%06x\n", se_nacl, nacl->nport_wwnn, nacl->nport_id); -- cgit v1.2.3 From e24805637d2d270d7975502e9024d473de86afdb Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 4 Oct 2014 04:23:15 +0000 Subject: target: Fix APTPL metadata handling for dynamic MappedLUNs This patch fixes a bug in handling of SPC-3 PR Activate Persistence across Target Power Loss (APTPL) logic where re-creation of state for MappedLUNs from dynamically generated NodeACLs did not occur during I_T Nexus establishment. It adds the missing core_scsi3_check_aptpl_registration() call during core_tpg_check_initiator_node_acl() -> core_tpg_add_node_to_devs() in order to replay any pre-loaded APTPL metadata state associated with the newly connected SCSI Initiator Port. Cc: Mike Christie Cc: Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_device.c | 3 ++- drivers/target/target_core_pr.c | 6 +++--- drivers/target/target_core_pr.h | 2 +- drivers/target/target_core_tpg.c | 8 ++++++++ 4 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index f5057a2f4ed1..d18dd8b532a8 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -1399,7 +1399,8 @@ int core_dev_add_initiator_node_lun_acl( * Check to see if there are any existing persistent reservation APTPL * pre-registrations that need to be enabled for this LUN ACL.. */ - core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, lacl); + core_scsi3_check_aptpl_registration(lun->lun_se_dev, tpg, lun, nacl, + lacl->mapped_lun); return 0; } diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index 48a801045176..a06edb59b67f 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -937,10 +937,10 @@ int core_scsi3_check_aptpl_registration( struct se_device *dev, struct se_portal_group *tpg, struct se_lun *lun, - struct se_lun_acl *lun_acl) + struct se_node_acl *nacl, + u32 mapped_lun) { - struct se_node_acl *nacl = lun_acl->se_lun_nacl; - struct se_dev_entry *deve = nacl->device_list[lun_acl->mapped_lun]; + struct se_dev_entry *deve = nacl->device_list[mapped_lun]; if (dev->dev_reservation_flags & DRF_SPC2_RESERVATIONS) return 0; diff --git a/drivers/target/target_core_pr.h b/drivers/target/target_core_pr.h index 2ee2936fa0bd..749fd7bb7510 100644 --- a/drivers/target/target_core_pr.h +++ b/drivers/target/target_core_pr.h @@ -60,7 +60,7 @@ extern int core_scsi3_alloc_aptpl_registration( unsigned char *, u16, u32, int, int, u8); extern int core_scsi3_check_aptpl_registration(struct se_device *, struct se_portal_group *, struct se_lun *, - struct se_lun_acl *); + struct se_node_acl *, u32); extern void core_scsi3_free_pr_reg_from_nacl(struct se_device *, struct se_node_acl *); extern void core_scsi3_free_all_registrations(struct se_device *); diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c index 6ec58736d1a4..aa2b2d0998e3 100644 --- a/drivers/target/target_core_tpg.c +++ b/drivers/target/target_core_tpg.c @@ -40,6 +40,7 @@ #include #include "target_core_internal.h" +#include "target_core_pr.h" extern struct se_device *g_lun0_dev; @@ -166,6 +167,13 @@ void core_tpg_add_node_to_devs( core_enable_device_list_for_node(lun, NULL, lun->unpacked_lun, lun_access, acl, tpg); + /* + * Check to see if there are any existing persistent reservation + * APTPL pre-registrations that need to be enabled for this dynamic + * LUN ACL now.. + */ + core_scsi3_check_aptpl_registration(dev, tpg, lun, acl, + lun->unpacked_lun); spin_lock(&tpg->tpg_lun_lock); } spin_unlock(&tpg->tpg_lun_lock); -- cgit v1.2.3 From 92404e609a2dffc55a9a22540ed48b6f0edc9c59 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sat, 4 Oct 2014 01:06:08 +0000 Subject: target: Add force_pr_aptpl device attribute This patch adds a force_pr_aptpl device attribute used to force SPC-3 PR Activate Persistence across Target Power Loss (APTPL) operation. This makes PR metadata write-out occur during state change regardless if new PERSISTENT_RESERVE_OUT CDBs have their APTPL feature bit set. This is useful during H/A failover in active/passive setups where all PR state is being re-created on a different node, driven by configfs backend device + export layout and pre-loaded $DEV/pr/res_aptpl_metadata. Cc: Mike Christie Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_configfs.c | 4 ++++ drivers/target/target_core_device.c | 18 ++++++++++++++++++ drivers/target/target_core_internal.h | 1 + drivers/target/target_core_pr.c | 9 +++++++++ include/target/target_core_base.h | 3 +++ 5 files changed, 35 insertions(+) diff --git a/drivers/target/target_core_configfs.c b/drivers/target/target_core_configfs.c index 291dc711fbc3..b30fc8f53bd8 100644 --- a/drivers/target/target_core_configfs.c +++ b/drivers/target/target_core_configfs.c @@ -665,6 +665,9 @@ SE_DEV_ATTR(is_nonrot, S_IRUGO | S_IWUSR); DEF_DEV_ATTRIB(emulate_rest_reord); SE_DEV_ATTR(emulate_rest_reord, S_IRUGO | S_IWUSR); +DEF_DEV_ATTRIB(force_pr_aptpl); +SE_DEV_ATTR(force_pr_aptpl, S_IRUGO | S_IWUSR); + DEF_DEV_ATTRIB_RO(hw_block_size); SE_DEV_ATTR_RO(hw_block_size); @@ -719,6 +722,7 @@ static struct configfs_attribute *target_core_dev_attrib_attrs[] = { &target_core_dev_attrib_hw_pi_prot_type.attr, &target_core_dev_attrib_pi_prot_format.attr, &target_core_dev_attrib_enforce_pr_isids.attr, + &target_core_dev_attrib_force_pr_aptpl.attr, &target_core_dev_attrib_is_nonrot.attr, &target_core_dev_attrib_emulate_rest_reord.attr, &target_core_dev_attrib_hw_block_size.attr, diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index d18dd8b532a8..c45f9e907e44 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -1018,6 +1018,23 @@ int se_dev_set_enforce_pr_isids(struct se_device *dev, int flag) return 0; } +int se_dev_set_force_pr_aptpl(struct se_device *dev, int flag) +{ + if ((flag != 0) && (flag != 1)) { + printk(KERN_ERR "Illegal value %d\n", flag); + return -EINVAL; + } + if (dev->export_count) { + pr_err("dev[%p]: Unable to set force_pr_aptpl while" + " export_count is %d\n", dev, dev->export_count); + return -EINVAL; + } + + dev->dev_attrib.force_pr_aptpl = flag; + pr_debug("dev[%p]: SE Device force_pr_aptpl: %d\n", dev, flag); + return 0; +} + int se_dev_set_is_nonrot(struct se_device *dev, int flag) { if ((flag != 0) && (flag != 1)) { @@ -1544,6 +1561,7 @@ struct se_device *target_alloc_device(struct se_hba *hba, const char *name) dev->dev_attrib.emulate_3pc = DA_EMULATE_3PC; dev->dev_attrib.pi_prot_type = TARGET_DIF_TYPE0_PROT; dev->dev_attrib.enforce_pr_isids = DA_ENFORCE_PR_ISIDS; + dev->dev_attrib.force_pr_aptpl = DA_FORCE_PR_APTPL; dev->dev_attrib.is_nonrot = DA_IS_NONROT; dev->dev_attrib.emulate_rest_reord = DA_EMULATE_REST_REORD; dev->dev_attrib.max_unmap_lba_count = DA_MAX_UNMAP_LBA_COUNT; diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 42ef4ab70585..e31f42f369ff 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -38,6 +38,7 @@ int se_dev_set_emulate_3pc(struct se_device *, int); int se_dev_set_pi_prot_type(struct se_device *, int); int se_dev_set_pi_prot_format(struct se_device *, int); int se_dev_set_enforce_pr_isids(struct se_device *, int); +int se_dev_set_force_pr_aptpl(struct se_device *, int); int se_dev_set_is_nonrot(struct se_device *, int); int se_dev_set_emulate_rest_reord(struct se_device *dev, int); int se_dev_set_queue_depth(struct se_device *, u32); diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index a06edb59b67f..8c60a1a1ae8d 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -895,6 +895,7 @@ static int __core_scsi3_check_aptpl_registration( spin_lock(&pr_tmpl->aptpl_reg_lock); list_for_each_entry_safe(pr_reg, pr_reg_tmp, &pr_tmpl->aptpl_reg_list, pr_reg_aptpl_list) { + if (!strcmp(pr_reg->pr_iport, i_port) && (pr_reg->pr_res_mapped_lun == deve->mapped_lun) && !(strcmp(pr_reg->pr_tport, t_port)) && @@ -3470,6 +3471,7 @@ static unsigned long long core_scsi3_extract_reservation_key(unsigned char *cdb) sense_reason_t target_scsi3_emulate_pr_out(struct se_cmd *cmd) { + struct se_device *dev = cmd->se_dev; unsigned char *cdb = &cmd->t_task_cdb[0]; unsigned char *buf; u64 res_key, sa_res_key; @@ -3534,6 +3536,13 @@ target_scsi3_emulate_pr_out(struct se_cmd *cmd) aptpl = (buf[17] & 0x01); unreg = (buf[17] & 0x02); } + /* + * If the backend device has been configured to force APTPL metadata + * write-out, go ahead and propigate aptpl=1 down now. + */ + if (dev->dev_attrib.force_pr_aptpl) + aptpl = 1; + transport_kunmap_data_sg(cmd); buf = NULL; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index b106240d8385..23c518a0340c 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -108,6 +108,8 @@ #define DA_EMULATE_ALUA 0 /* Enforce SCSI Initiator Port TransportID with 'ISID' for PR */ #define DA_ENFORCE_PR_ISIDS 1 +/* Force SPC-3 PR Activate Persistence across Target Power Loss */ +#define DA_FORCE_PR_APTPL 0 #define DA_STATUS_MAX_SECTORS_MIN 16 #define DA_STATUS_MAX_SECTORS_MAX 8192 /* By default don't report non-rotating (solid state) medium */ @@ -680,6 +682,7 @@ struct se_dev_attrib { enum target_prot_type pi_prot_type; enum target_prot_type hw_pi_prot_type; int enforce_pr_isids; + int force_pr_aptpl; int is_nonrot; int emulate_rest_reord; u32 hw_block_size; -- cgit v1.2.3 From 0d0f660d882c1c02748ced13966a2413aa5d6cc2 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Sun, 5 Oct 2014 02:13:03 -0700 Subject: iser-target: Disable TX completion interrupt coalescing This patch explicitly disables TX completion interrupt coalescing logic in isert_put_response() and isert_put_datain() that was originally added as an efficiency optimization in commit 95b60f07. It has been reported that this change can trigger ABORT_TASK timeouts under certain small block workloads, where disabling coalescing was required for stability. According to Sagi, this doesn't impact overall performance, so go ahead and disable it for now. Reported-by: Moussa Ba Reported-by: Sagi Grimberg Cc: # 3.13+ Signed-off-by: Nicholas Bellinger --- drivers/infiniband/ulp/isert/ib_isert.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 40969b683fdd..f7191128b725 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -2183,7 +2183,7 @@ isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd) isert_cmd->tx_desc.num_sge = 2; } - isert_init_send_wr(isert_conn, isert_cmd, send_wr, true); + isert_init_send_wr(isert_conn, isert_cmd, send_wr, false); pr_debug("Posting SCSI Response IB_WR_SEND >>>>>>>>>>>>>>>>>>>>>>\n"); @@ -2882,7 +2882,7 @@ isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd) &isert_cmd->tx_desc.iscsi_header); isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc); isert_init_send_wr(isert_conn, isert_cmd, - &isert_cmd->tx_desc.send_wr, true); + &isert_cmd->tx_desc.send_wr, false); isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr; wr->send_wr_num += 1; } -- cgit v1.2.3 From e2bfb088fac03c0f621886a04cffc7faa2b49b1d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Oct 2014 22:47:07 -0400 Subject: ext4: don't orphan or truncate the boot loader inode The boot loader inode (inode #5) should never be visible in the directory hierarchy, but it's possible if the file system is corrupted that there will be a directory entry that points at inode #5. In order to avoid accidentally trashing it, when such a directory inode is opened, the inode will be marked as a bad inode, so that it's not possible to modify (or read) the inode from userspace. Unfortunately, when we unlink this (invalid/illegal) directory entry, we will put the bad inode on the ophan list, and then when try to unlink the directory, we don't actually remove the bad inode from the orphan list before freeing in-memory inode structure. This means the in-memory orphan list is corrupted, leading to a kernel oops. In addition, avoid truncating a bad inode in ext4_destroy_inode(), since truncating the boot loader inode is not a smart thing to do. Reported-by: Sami Liedes Reviewed-by: Jan Kara Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 7 +++---- fs/ext4/namei.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 41c4f97c39d3..59983b28a93c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -224,16 +224,15 @@ void ext4_evict_inode(struct inode *inode) goto no_delete; } - if (!is_bad_inode(inode)) - dquot_initialize(inode); + if (is_bad_inode(inode)) + goto no_delete; + dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - if (is_bad_inode(inode)) - goto no_delete; /* * Protect us against freezing - iput() caller didn't have to have any diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 51705f8c4116..a2a9d40522d2 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2544,7 +2544,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) int err = 0, rc; bool dirty = false; - if (!sbi->s_journal) + if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && -- cgit v1.2.3 From f4bb2981024fc91b23b4d09a8817c415396dbabb Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 5 Oct 2014 22:56:00 -0400 Subject: ext4: add ext4_iget_normal() which is to be used for dir tree lookups If there is a corrupted file system which has directory entries that point at reserved, metadata inodes, prohibit them from being used by treating them the same way we treat Boot Loader inodes --- that is, mark them to be bad inodes. This prohibits them from being opened, deleted, or modified via chmod, chown, utimes, etc. In particular, this prevents a corrupted file system which has a directory entry which points at the journal inode from being deleted and its blocks released, after which point Much Hilarity Ensues. Reported-by: Sami Liedes Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 7 +++++++ fs/ext4/namei.c | 4 ++-- fs/ext4/super.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1eb5b7b912a8..012e89bd9644 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle, #define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59983b28a93c..e204d8aabe7d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4104,6 +4104,13 @@ bad_inode: return ERR_PTR(ret); } +struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) +{ + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-EIO); + return ext4_iget(sb, ino); +} + static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a2a9d40522d2..7037ecf0fc23 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1417,7 +1417,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EIO); } - inode = ext4_iget(dir->i_sb, ino); + inode = ext4_iget_normal(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1450,7 +1450,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1070d6e521c6..a0811cc00c91 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1001,7 +1001,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget(sb, ino); + inode = ext4_iget_normal(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { -- cgit v1.2.3 From 673e7bbdb3920b62cfc6c710bea626b0a9b0f43a Mon Sep 17 00:00:00 2001 From: "U. Artie Eoff" Date: Mon, 29 Sep 2014 15:49:32 -0700 Subject: drm/i915: intel_backlight scale() math WA Improper truncated integer division in the scale() function causes actual_brightness != brightness. This (partial) work-around should be sufficient for a majority of use-cases, but it is by no means a complete solution. TODO: Determine how best to scale "user" values to "hw" values, and vice-versa, when the ranges are of different sizes. That would be a buggy scenario even with this work-around. The issue was introduced in the following (v3.17-rc1) commit: 6dda730 drm/i915: respect the VBT minimum backlight brightness Note that for easier backporting this commit adds a duplicated macro. A follow-up cleanup patch rectifies this for 3.18+ v2: (thanks to Chris Wilson) clarify commit message, use rounded division macro v3: -DIV_ROUND_CLOSEST() fails to build with CONFIG_X86_32=y. (Jani) -Use DIV_ROUND_CLOSEST_ULL() instead. (Damien) -v1 and v2 originally authored by Joe Konno. Signed-off-by: U. Artie Eoff Cc: stable@vger.kernel.org Reviewed-By: Joe Konno [danvet: Add backporting note.] Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_panel.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index 18784470a760..97ff71f40dd3 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -398,6 +398,9 @@ intel_panel_detect(struct drm_device *dev) } } +#define DIV_ROUND_CLOSEST_ULL(ll, d) \ +({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) + /** * scale - scale values from one range to another * @@ -419,9 +422,8 @@ static uint32_t scale(uint32_t source_val, source_val = clamp(source_val, source_min, source_max); /* avoid overflows */ - target_val = (uint64_t)(source_val - source_min) * - (target_max - target_min); - do_div(target_val, source_max - source_min); + target_val = DIV_ROUND_CLOSEST_ULL((uint64_t)(source_val - source_min) * + (target_max - target_min), source_max - source_min); target_val += target_min; return target_val; -- cgit v1.2.3 From 2e5416252af5239ddcd92bc858a494c1ca842778 Mon Sep 17 00:00:00 2001 From: "U. Artie Eoff" Date: Mon, 29 Sep 2014 15:49:33 -0700 Subject: drm/i915: Move DIV_ROUND_CLOSEST_ULL macro to header Move the duplicated DIV_ROUND_CLOSEST_ULL macro into the intel_drv.h header file so that it can be shared between intel_display.c and intel_panel.c. Signed-off-by: U. Artie Eoff Reviewed-By: Joe Konno Signed-off-by: Daniel Vetter --- drivers/gpu/drm/i915/intel_display.c | 3 --- drivers/gpu/drm/i915/intel_drv.h | 3 +++ drivers/gpu/drm/i915/intel_panel.c | 3 --- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 507370513f3d..6a0cc367934d 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -73,9 +73,6 @@ static const uint32_t intel_cursor_formats[] = { DRM_FORMAT_ARGB8888, }; -#define DIV_ROUND_CLOSEST_ULL(ll, d) \ -({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) - static void intel_increase_pllclock(struct drm_device *dev, enum pipe pipe); static void intel_crtc_update_cursor(struct drm_crtc *crtc, bool on); diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h index 07ce04683c30..ba715229a540 100644 --- a/drivers/gpu/drm/i915/intel_drv.h +++ b/drivers/gpu/drm/i915/intel_drv.h @@ -35,6 +35,9 @@ #include #include +#define DIV_ROUND_CLOSEST_ULL(ll, d) \ +({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) + /** * _wait_for - magic (register) wait macro * diff --git a/drivers/gpu/drm/i915/intel_panel.c b/drivers/gpu/drm/i915/intel_panel.c index 97ff71f40dd3..0e018cb49147 100644 --- a/drivers/gpu/drm/i915/intel_panel.c +++ b/drivers/gpu/drm/i915/intel_panel.c @@ -398,9 +398,6 @@ intel_panel_detect(struct drm_device *dev) } } -#define DIV_ROUND_CLOSEST_ULL(ll, d) \ -({ unsigned long long _tmp = (ll)+(d)/2; do_div(_tmp, d); _tmp; }) - /** * scale - scale values from one range to another * -- cgit v1.2.3 From f2fc42b6ac31f4d808da7a9da460dd433a71e976 Mon Sep 17 00:00:00 2001 From: Suman Anna Date: Thu, 12 Jun 2014 22:30:34 +0530 Subject: mailbox: rename pl320-ipc specific mailbox.h The patch 30058677 "ARM / highbank: add support for pl320 IPC" added a pl320 IPC specific header file as a generic mailbox.h. This file has been renamed appropriately to allow the introduction of the generic mailbox API framework. Acked-by: Mark Langsdorf Cc: Rafael J. Wysocki Signed-off-by: Suman Anna Reviewed-by: Mark Brown Acked-by: Arnd Bergmann --- arch/arm/mach-highbank/highbank.c | 2 +- drivers/cpufreq/highbank-cpufreq.c | 2 +- drivers/mailbox/pl320-ipc.c | 2 +- include/linux/mailbox.h | 17 ----------------- include/linux/pl320-ipc.h | 17 +++++++++++++++++ 5 files changed, 20 insertions(+), 20 deletions(-) delete mode 100644 include/linux/mailbox.h create mode 100644 include/linux/pl320-ipc.h diff --git a/arch/arm/mach-highbank/highbank.c b/arch/arm/mach-highbank/highbank.c index 8c35ae4ff176..07a09570175d 100644 --- a/arch/arm/mach-highbank/highbank.c +++ b/arch/arm/mach-highbank/highbank.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/cpufreq/highbank-cpufreq.c b/drivers/cpufreq/highbank-cpufreq.c index bf8902a0866d..b464f29d8d54 100644 --- a/drivers/cpufreq/highbank-cpufreq.c +++ b/drivers/cpufreq/highbank-cpufreq.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #define HB_CPUFREQ_CHANGE_NOTE 0x80000001 diff --git a/drivers/mailbox/pl320-ipc.c b/drivers/mailbox/pl320-ipc.c index d873cbae2fbb..f3755e0aa935 100644 --- a/drivers/mailbox/pl320-ipc.c +++ b/drivers/mailbox/pl320-ipc.c @@ -26,7 +26,7 @@ #include #include -#include +#include #define IPCMxSOURCE(m) ((m) * 0x40) #define IPCMxDSET(m) (((m) * 0x40) + 0x004) diff --git a/include/linux/mailbox.h b/include/linux/mailbox.h deleted file mode 100644 index 5161f63ec1c8..000000000000 --- a/include/linux/mailbox.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -int pl320_ipc_transmit(u32 *data); -int pl320_ipc_register_notifier(struct notifier_block *nb); -int pl320_ipc_unregister_notifier(struct notifier_block *nb); diff --git a/include/linux/pl320-ipc.h b/include/linux/pl320-ipc.h new file mode 100644 index 000000000000..5161f63ec1c8 --- /dev/null +++ b/include/linux/pl320-ipc.h @@ -0,0 +1,17 @@ +/* + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +int pl320_ipc_transmit(u32 *data); +int pl320_ipc_register_notifier(struct notifier_block *nb); +int pl320_ipc_unregister_notifier(struct notifier_block *nb); -- cgit v1.2.3 From 2b6d83e2b8b7de82331a6a1dcd64b51020a6031c Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Thu, 12 Jun 2014 22:31:19 +0530 Subject: mailbox: Introduce framework for mailbox Introduce common framework for client/protocol drivers and controller drivers of Inter-Processor-Communication (IPC). Client driver developers should have a look at include/linux/mailbox_client.h to understand the part of the API exposed to client drivers. Similarly controller driver developers should have a look at include/linux/mailbox_controller.h Reviewed-by: Mark Brown Signed-off-by: Jassi Brar --- MAINTAINERS | 8 + drivers/mailbox/Makefile | 4 + drivers/mailbox/mailbox.c | 465 +++++++++++++++++++++++++++++++++++++ include/linux/mailbox_client.h | 46 ++++ include/linux/mailbox_controller.h | 133 +++++++++++ 5 files changed, 656 insertions(+) create mode 100644 drivers/mailbox/mailbox.c create mode 100644 include/linux/mailbox_client.h create mode 100644 include/linux/mailbox_controller.h diff --git a/MAINTAINERS b/MAINTAINERS index a12edf2624e5..c5f7c854b6d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5740,6 +5740,14 @@ S: Maintained F: drivers/net/macvlan.c F: include/linux/if_macvlan.h +MAILBOX API +M: Jassi Brar +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/mailbox/ +F: include/linux/mailbox_client.h +F: include/linux/mailbox_controller.h + MAN-PAGES: MANUAL PAGES FOR LINUX -- Sections 2, 3, 4, 5, and 7 M: Michael Kerrisk W: http://www.kernel.org/doc/man-pages diff --git a/drivers/mailbox/Makefile b/drivers/mailbox/Makefile index 6d184dbcaca8..94ed7cefb14d 100644 --- a/drivers/mailbox/Makefile +++ b/drivers/mailbox/Makefile @@ -1,3 +1,7 @@ +# Generic MAILBOX API + +obj-$(CONFIG_MAILBOX) += mailbox.o + obj-$(CONFIG_PL320_MBOX) += pl320-ipc.o obj-$(CONFIG_OMAP2PLUS_MBOX) += omap-mailbox.o diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c new file mode 100644 index 000000000000..afcb430508ec --- /dev/null +++ b/drivers/mailbox/mailbox.c @@ -0,0 +1,465 @@ +/* + * Mailbox: Common code for Mailbox controllers and users + * + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TXDONE_BY_IRQ BIT(0) /* controller has remote RTR irq */ +#define TXDONE_BY_POLL BIT(1) /* controller can read status of last TX */ +#define TXDONE_BY_ACK BIT(2) /* S/W ACK recevied by Client ticks the TX */ + +static LIST_HEAD(mbox_cons); +static DEFINE_MUTEX(con_mutex); + +static int add_to_rbuf(struct mbox_chan *chan, void *mssg) +{ + int idx; + unsigned long flags; + + spin_lock_irqsave(&chan->lock, flags); + + /* See if there is any space left */ + if (chan->msg_count == MBOX_TX_QUEUE_LEN) { + spin_unlock_irqrestore(&chan->lock, flags); + return -ENOBUFS; + } + + idx = chan->msg_free; + chan->msg_data[idx] = mssg; + chan->msg_count++; + + if (idx == MBOX_TX_QUEUE_LEN - 1) + chan->msg_free = 0; + else + chan->msg_free++; + + spin_unlock_irqrestore(&chan->lock, flags); + + return idx; +} + +static void msg_submit(struct mbox_chan *chan) +{ + unsigned count, idx; + unsigned long flags; + void *data; + int err; + + spin_lock_irqsave(&chan->lock, flags); + + if (!chan->msg_count || chan->active_req) + goto exit; + + count = chan->msg_count; + idx = chan->msg_free; + if (idx >= count) + idx -= count; + else + idx += MBOX_TX_QUEUE_LEN - count; + + data = chan->msg_data[idx]; + + /* Try to submit a message to the MBOX controller */ + err = chan->mbox->ops->send_data(chan, data); + if (!err) { + chan->active_req = data; + chan->msg_count--; + } +exit: + spin_unlock_irqrestore(&chan->lock, flags); +} + +static void tx_tick(struct mbox_chan *chan, int r) +{ + unsigned long flags; + void *mssg; + + spin_lock_irqsave(&chan->lock, flags); + mssg = chan->active_req; + chan->active_req = NULL; + spin_unlock_irqrestore(&chan->lock, flags); + + /* Submit next message */ + msg_submit(chan); + + /* Notify the client */ + if (mssg && chan->cl->tx_done) + chan->cl->tx_done(chan->cl, mssg, r); + + if (chan->cl->tx_block) + complete(&chan->tx_complete); +} + +static void poll_txdone(unsigned long data) +{ + struct mbox_controller *mbox = (struct mbox_controller *)data; + bool txdone, resched = false; + int i; + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + if (chan->active_req && chan->cl) { + resched = true; + txdone = chan->mbox->ops->last_tx_done(chan); + if (txdone) + tx_tick(chan, 0); + } + } + + if (resched) + mod_timer(&mbox->poll, jiffies + + msecs_to_jiffies(mbox->txpoll_period)); +} + +/** + * mbox_chan_received_data - A way for controller driver to push data + * received from remote to the upper layer. + * @chan: Pointer to the mailbox channel on which RX happened. + * @mssg: Client specific message typecasted as void * + * + * After startup and before shutdown any data received on the chan + * is passed on to the API via atomic mbox_chan_received_data(). + * The controller should ACK the RX only after this call returns. + */ +void mbox_chan_received_data(struct mbox_chan *chan, void *mssg) +{ + /* No buffering the received data */ + if (chan->cl->rx_callback) + chan->cl->rx_callback(chan->cl, mssg); +} +EXPORT_SYMBOL_GPL(mbox_chan_received_data); + +/** + * mbox_chan_txdone - A way for controller driver to notify the + * framework that the last TX has completed. + * @chan: Pointer to the mailbox chan on which TX happened. + * @r: Status of last TX - OK or ERROR + * + * The controller that has IRQ for TX ACK calls this atomic API + * to tick the TX state machine. It works only if txdone_irq + * is set by the controller. + */ +void mbox_chan_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_IRQ))) { + dev_err(chan->mbox->dev, + "Controller can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_chan_txdone); + +/** + * mbox_client_txdone - The way for a client to run the TX state machine. + * @chan: Mailbox channel assigned to this client. + * @r: Success status of last transmission. + * + * The client/protocol had received some 'ACK' packet and it notifies + * the API that the last packet was sent successfully. This only works + * if the controller can't sense TX-Done. + */ +void mbox_client_txdone(struct mbox_chan *chan, int r) +{ + if (unlikely(!(chan->txdone_method & TXDONE_BY_ACK))) { + dev_err(chan->mbox->dev, "Client can't run the TX ticker\n"); + return; + } + + tx_tick(chan, r); +} +EXPORT_SYMBOL_GPL(mbox_client_txdone); + +/** + * mbox_client_peek_data - A way for client driver to pull data + * received from remote by the controller. + * @chan: Mailbox channel assigned to this client. + * + * A poke to controller driver for any received data. + * The data is actually passed onto client via the + * mbox_chan_received_data() + * The call can be made from atomic context, so the controller's + * implementation of peek_data() must not sleep. + * + * Return: True, if controller has, and is going to push after this, + * some data. + * False, if controller doesn't have any data to be read. + */ +bool mbox_client_peek_data(struct mbox_chan *chan) +{ + if (chan->mbox->ops->peek_data) + return chan->mbox->ops->peek_data(chan); + + return false; +} +EXPORT_SYMBOL_GPL(mbox_client_peek_data); + +/** + * mbox_send_message - For client to submit a message to be + * sent to the remote. + * @chan: Mailbox channel assigned to this client. + * @mssg: Client specific message typecasted. + * + * For client to submit data to the controller destined for a remote + * processor. If the client had set 'tx_block', the call will return + * either when the remote receives the data or when 'tx_tout' millisecs + * run out. + * In non-blocking mode, the requests are buffered by the API and a + * non-negative token is returned for each queued request. If the request + * is not queued, a negative token is returned. Upon failure or successful + * TX, the API calls 'tx_done' from atomic context, from which the client + * could submit yet another request. + * The pointer to message should be preserved until it is sent + * over the chan, i.e, tx_done() is made. + * This function could be called from atomic context as it simply + * queues the data and returns a token against the request. + * + * Return: Non-negative integer for successful submission (non-blocking mode) + * or transmission over chan (blocking mode). + * Negative value denotes failure. + */ +int mbox_send_message(struct mbox_chan *chan, void *mssg) +{ + int t; + + if (!chan || !chan->cl) + return -EINVAL; + + t = add_to_rbuf(chan, mssg); + if (t < 0) { + dev_err(chan->mbox->dev, "Try increasing MBOX_TX_QUEUE_LEN\n"); + return t; + } + + msg_submit(chan); + + if (chan->txdone_method == TXDONE_BY_POLL) + poll_txdone((unsigned long)chan->mbox); + + if (chan->cl->tx_block && chan->active_req) { + unsigned long wait; + int ret; + + if (!chan->cl->tx_tout) /* wait forever */ + wait = msecs_to_jiffies(3600000); + else + wait = msecs_to_jiffies(chan->cl->tx_tout); + + ret = wait_for_completion_timeout(&chan->tx_complete, wait); + if (ret == 0) { + t = -EIO; + tx_tick(chan, -EIO); + } + } + + return t; +} +EXPORT_SYMBOL_GPL(mbox_send_message); + +/** + * mbox_request_channel - Request a mailbox channel. + * @cl: Identity of the client requesting the channel. + * @index: Index of mailbox specifier in 'mboxes' property. + * + * The Client specifies its requirements and capabilities while asking for + * a mailbox channel. It can't be called from atomic context. + * The channel is exclusively allocated and can't be used by another + * client before the owner calls mbox_free_channel. + * After assignment, any packet received on this channel will be + * handed over to the client via the 'rx_callback'. + * The framework holds reference to the client, so the mbox_client + * structure shouldn't be modified until the mbox_free_channel returns. + * + * Return: Pointer to the channel assigned to the client if successful. + * ERR_PTR for request failure. + */ +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) +{ + struct device *dev = cl->dev; + struct mbox_controller *mbox; + struct of_phandle_args spec; + struct mbox_chan *chan; + unsigned long flags; + int ret; + + if (!dev || !dev->of_node) { + pr_debug("%s: No owner device node\n", __func__); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&con_mutex); + + if (of_parse_phandle_with_args(dev->of_node, "mboxes", + "#mbox-cells", index, &spec)) { + dev_dbg(dev, "%s: can't parse \"mboxes\" property\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-ENODEV); + } + + chan = NULL; + list_for_each_entry(mbox, &mbox_cons, node) + if (mbox->dev->of_node == spec.np) { + chan = mbox->of_xlate(mbox, &spec); + break; + } + + of_node_put(spec.np); + + if (!chan || chan->cl || !try_module_get(mbox->dev->driver->owner)) { + dev_dbg(dev, "%s: mailbox not free\n", __func__); + mutex_unlock(&con_mutex); + return ERR_PTR(-EBUSY); + } + + spin_lock_irqsave(&chan->lock, flags); + chan->msg_free = 0; + chan->msg_count = 0; + chan->active_req = NULL; + chan->cl = cl; + init_completion(&chan->tx_complete); + + if (chan->txdone_method == TXDONE_BY_POLL && cl->knows_txdone) + chan->txdone_method |= TXDONE_BY_ACK; + + spin_unlock_irqrestore(&chan->lock, flags); + + ret = chan->mbox->ops->startup(chan); + if (ret) { + dev_err(dev, "Unable to startup the chan (%d)\n", ret); + mbox_free_channel(chan); + chan = ERR_PTR(ret); + } + + mutex_unlock(&con_mutex); + return chan; +} +EXPORT_SYMBOL_GPL(mbox_request_channel); + +/** + * mbox_free_channel - The client relinquishes control of a mailbox + * channel by this call. + * @chan: The mailbox channel to be freed. + */ +void mbox_free_channel(struct mbox_chan *chan) +{ + unsigned long flags; + + if (!chan || !chan->cl) + return; + + chan->mbox->ops->shutdown(chan); + + /* The queued TX requests are simply aborted, no callbacks are made */ + spin_lock_irqsave(&chan->lock, flags); + chan->cl = NULL; + chan->active_req = NULL; + if (chan->txdone_method == (TXDONE_BY_POLL | TXDONE_BY_ACK)) + chan->txdone_method = TXDONE_BY_POLL; + + module_put(chan->mbox->dev->driver->owner); + spin_unlock_irqrestore(&chan->lock, flags); +} +EXPORT_SYMBOL_GPL(mbox_free_channel); + +static struct mbox_chan * +of_mbox_index_xlate(struct mbox_controller *mbox, + const struct of_phandle_args *sp) +{ + int ind = sp->args[0]; + + if (ind >= mbox->num_chans) + return NULL; + + return &mbox->chans[ind]; +} + +/** + * mbox_controller_register - Register the mailbox controller + * @mbox: Pointer to the mailbox controller. + * + * The controller driver registers its communication channels + */ +int mbox_controller_register(struct mbox_controller *mbox) +{ + int i, txdone; + + /* Sanity check */ + if (!mbox || !mbox->dev || !mbox->ops || !mbox->num_chans) + return -EINVAL; + + if (mbox->txdone_irq) + txdone = TXDONE_BY_IRQ; + else if (mbox->txdone_poll) + txdone = TXDONE_BY_POLL; + else /* It has to be ACK then */ + txdone = TXDONE_BY_ACK; + + if (txdone == TXDONE_BY_POLL) { + mbox->poll.function = &poll_txdone; + mbox->poll.data = (unsigned long)mbox; + init_timer(&mbox->poll); + } + + for (i = 0; i < mbox->num_chans; i++) { + struct mbox_chan *chan = &mbox->chans[i]; + + chan->cl = NULL; + chan->mbox = mbox; + chan->txdone_method = txdone; + spin_lock_init(&chan->lock); + } + + if (!mbox->of_xlate) + mbox->of_xlate = of_mbox_index_xlate; + + mutex_lock(&con_mutex); + list_add_tail(&mbox->node, &mbox_cons); + mutex_unlock(&con_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(mbox_controller_register); + +/** + * mbox_controller_unregister - Unregister the mailbox controller + * @mbox: Pointer to the mailbox controller. + */ +void mbox_controller_unregister(struct mbox_controller *mbox) +{ + int i; + + if (!mbox) + return; + + mutex_lock(&con_mutex); + + list_del(&mbox->node); + + for (i = 0; i < mbox->num_chans; i++) + mbox_free_channel(&mbox->chans[i]); + + if (mbox->txdone_poll) + del_timer_sync(&mbox->poll); + + mutex_unlock(&con_mutex); +} +EXPORT_SYMBOL_GPL(mbox_controller_unregister); diff --git a/include/linux/mailbox_client.h b/include/linux/mailbox_client.h new file mode 100644 index 000000000000..307d9cab2026 --- /dev/null +++ b/include/linux/mailbox_client.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2013-2014 Linaro Ltd. + * Author: Jassi Brar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CLIENT_H +#define __MAILBOX_CLIENT_H + +#include +#include + +struct mbox_chan; + +/** + * struct mbox_client - User of a mailbox + * @dev: The client device + * @tx_block: If the mbox_send_message should block until data is + * transmitted. + * @tx_tout: Max block period in ms before TX is assumed failure + * @knows_txdone: If the client could run the TX state machine. Usually + * if the client receives some ACK packet for transmission. + * Unused if the controller already has TX_Done/RTR IRQ. + * @rx_callback: Atomic callback to provide client the data received + * @tx_done: Atomic callback to tell client of data transmission + */ +struct mbox_client { + struct device *dev; + bool tx_block; + unsigned long tx_tout; + bool knows_txdone; + + void (*rx_callback)(struct mbox_client *cl, void *mssg); + void (*tx_done)(struct mbox_client *cl, void *mssg, int r); +}; + +struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index); +int mbox_send_message(struct mbox_chan *chan, void *mssg); +void mbox_client_txdone(struct mbox_chan *chan, int r); /* atomic */ +bool mbox_client_peek_data(struct mbox_chan *chan); /* atomic */ +void mbox_free_channel(struct mbox_chan *chan); /* may sleep */ + +#endif /* __MAILBOX_CLIENT_H */ diff --git a/include/linux/mailbox_controller.h b/include/linux/mailbox_controller.h new file mode 100644 index 000000000000..d4cf96f07cfc --- /dev/null +++ b/include/linux/mailbox_controller.h @@ -0,0 +1,133 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MAILBOX_CONTROLLER_H +#define __MAILBOX_CONTROLLER_H + +#include +#include +#include +#include +#include + +struct mbox_chan; + +/** + * struct mbox_chan_ops - methods to control mailbox channels + * @send_data: The API asks the MBOX controller driver, in atomic + * context try to transmit a message on the bus. Returns 0 if + * data is accepted for transmission, -EBUSY while rejecting + * if the remote hasn't yet read the last data sent. Actual + * transmission of data is reported by the controller via + * mbox_chan_txdone (if it has some TX ACK irq). It must not + * sleep. + * @startup: Called when a client requests the chan. The controller + * could ask clients for additional parameters of communication + * to be provided via client's chan_data. This call may + * block. After this call the Controller must forward any + * data received on the chan by calling mbox_chan_received_data. + * The controller may do stuff that need to sleep. + * @shutdown: Called when a client relinquishes control of a chan. + * This call may block too. The controller must not forward + * any received data anymore. + * The controller may do stuff that need to sleep. + * @last_tx_done: If the controller sets 'txdone_poll', the API calls + * this to poll status of last TX. The controller must + * give priority to IRQ method over polling and never + * set both txdone_poll and txdone_irq. Only in polling + * mode 'send_data' is expected to return -EBUSY. + * The controller may do stuff that need to sleep/block. + * Used only if txdone_poll:=true && txdone_irq:=false + * @peek_data: Atomic check for any received data. Return true if controller + * has some data to push to the client. False otherwise. + */ +struct mbox_chan_ops { + int (*send_data)(struct mbox_chan *chan, void *data); + int (*startup)(struct mbox_chan *chan); + void (*shutdown)(struct mbox_chan *chan); + bool (*last_tx_done)(struct mbox_chan *chan); + bool (*peek_data)(struct mbox_chan *chan); +}; + +/** + * struct mbox_controller - Controller of a class of communication channels + * @dev: Device backing this controller + * @ops: Operators that work on each communication chan + * @chans: Array of channels + * @num_chans: Number of channels in the 'chans' array. + * @txdone_irq: Indicates if the controller can report to API when + * the last transmitted data was read by the remote. + * Eg, if it has some TX ACK irq. + * @txdone_poll: If the controller can read but not report the TX + * done. Ex, some register shows the TX status but + * no interrupt rises. Ignored if 'txdone_irq' is set. + * @txpoll_period: If 'txdone_poll' is in effect, the API polls for + * last TX's status after these many millisecs + * @of_xlate: Controller driver specific mapping of channel via DT + * @poll: API private. Used to poll for TXDONE on all channels. + * @node: API private. To hook into list of controllers. + */ +struct mbox_controller { + struct device *dev; + struct mbox_chan_ops *ops; + struct mbox_chan *chans; + int num_chans; + bool txdone_irq; + bool txdone_poll; + unsigned txpoll_period; + struct mbox_chan *(*of_xlate)(struct mbox_controller *mbox, + const struct of_phandle_args *sp); + /* Internal to API */ + struct timer_list poll; + struct list_head node; +}; + +/* + * The length of circular buffer for queuing messages from a client. + * 'msg_count' tracks the number of buffered messages while 'msg_free' + * is the index where the next message would be buffered. + * We shouldn't need it too big because every transfer is interrupt + * triggered and if we have lots of data to transfer, the interrupt + * latencies are going to be the bottleneck, not the buffer length. + * Besides, mbox_send_message could be called from atomic context and + * the client could also queue another message from the notifier 'tx_done' + * of the last transfer done. + * REVISIT: If too many platforms see the "Try increasing MBOX_TX_QUEUE_LEN" + * print, it needs to be taken from config option or somesuch. + */ +#define MBOX_TX_QUEUE_LEN 20 + +/** + * struct mbox_chan - s/w representation of a communication chan + * @mbox: Pointer to the parent/provider of this channel + * @txdone_method: Way to detect TXDone chosen by the API + * @cl: Pointer to the current owner of this channel + * @tx_complete: Transmission completion + * @active_req: Currently active request hook + * @msg_count: No. of mssg currently queued + * @msg_free: Index of next available mssg slot + * @msg_data: Hook for data packet + * @lock: Serialise access to the channel + * @con_priv: Hook for controller driver to attach private data + */ +struct mbox_chan { + struct mbox_controller *mbox; + unsigned txdone_method; + struct mbox_client *cl; + struct completion tx_complete; + void *active_req; + unsigned msg_count, msg_free; + void *msg_data[MBOX_TX_QUEUE_LEN]; + spinlock_t lock; /* Serialise access to the channel */ + void *con_priv; +}; + +int mbox_controller_register(struct mbox_controller *mbox); /* can sleep */ +void mbox_controller_unregister(struct mbox_controller *mbox); /* can sleep */ +void mbox_chan_received_data(struct mbox_chan *chan, void *data); /* atomic */ +void mbox_chan_txdone(struct mbox_chan *chan, int r); /* atomic */ + +#endif /* __MAILBOX_CONTROLLER_H */ -- cgit v1.2.3 From 15320fbcec69dc3a4f217044ed848e4225397e25 Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Tue, 22 Jul 2014 20:05:58 +0530 Subject: doc: add documentation for mailbox framework Some explanations with examples of how to write to implement users and providers of the mailbox framework. Signed-off-by: Jassi Brar --- Documentation/mailbox.txt | 122 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 Documentation/mailbox.txt diff --git a/Documentation/mailbox.txt b/Documentation/mailbox.txt new file mode 100644 index 000000000000..60f43ff629aa --- /dev/null +++ b/Documentation/mailbox.txt @@ -0,0 +1,122 @@ + The Common Mailbox Framework + Jassi Brar + + This document aims to help developers write client and controller +drivers for the API. But before we start, let us note that the +client (especially) and controller drivers are likely going to be +very platform specific because the remote firmware is likely to be +proprietary and implement non-standard protocol. So even if two +platforms employ, say, PL320 controller, the client drivers can't +be shared across them. Even the PL320 driver might need to accommodate +some platform specific quirks. So the API is meant mainly to avoid +similar copies of code written for each platform. Having said that, +nothing prevents the remote f/w to also be Linux based and use the +same api there. However none of that helps us locally because we only +ever deal at client's protocol level. + Some of the choices made during implementation are the result of this +peculiarity of this "common" framework. + + + + Part 1 - Controller Driver (See include/linux/mailbox_controller.h) + + Allocate mbox_controller and the array of mbox_chan. +Populate mbox_chan_ops, except peek_data() all are mandatory. +The controller driver might know a message has been consumed +by the remote by getting an IRQ or polling some hardware flag +or it can never know (the client knows by way of the protocol). +The method in order of preference is IRQ -> Poll -> None, which +the controller driver should set via 'txdone_irq' or 'txdone_poll' +or neither. + + + Part 2 - Client Driver (See include/linux/mailbox_client.h) + + The client might want to operate in blocking mode (synchronously +send a message through before returning) or non-blocking/async mode (submit +a message and a callback function to the API and return immediately). + + +struct demo_client { + struct mbox_client cl; + struct mbox_chan *mbox; + struct completion c; + bool async; + /* ... */ +}; + +/* + * This is the handler for data received from remote. The behaviour is purely + * dependent upon the protocol. This is just an example. + */ +static void message_from_remote(struct mbox_client *cl, void *mssg) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + if (dc->aysnc) { + if (is_an_ack(mssg)) { + /* An ACK to our last sample sent */ + return; /* Or do something else here */ + } else { /* A new message from remote */ + queue_req(mssg); + } + } else { + /* Remote f/w sends only ACK packets on this channel */ + return; + } +} + +static void sample_sent(struct mbox_client *cl, void *mssg, int r) +{ + struct demo_client *dc = container_of(mbox_client, + struct demo_client, cl); + complete(&dc->c); +} + +static void client_demo(struct platform_device *pdev) +{ + struct demo_client *dc_sync, *dc_async; + /* The controller already knows async_pkt and sync_pkt */ + struct async_pkt ap; + struct sync_pkt sp; + + dc_sync = kzalloc(sizeof(*dc_sync), GFP_KERNEL); + dc_async = kzalloc(sizeof(*dc_async), GFP_KERNEL); + + /* Populate non-blocking mode client */ + dc_async->cl.dev = &pdev->dev; + dc_async->cl.rx_callback = message_from_remote; + dc_async->cl.tx_done = sample_sent; + dc_async->cl.tx_block = false; + dc_async->cl.tx_tout = 0; /* doesn't matter here */ + dc_async->cl.knows_txdone = false; /* depending upon protocol */ + dc_async->async = true; + init_completion(&dc_async->c); + + /* Populate blocking mode client */ + dc_sync->cl.dev = &pdev->dev; + dc_sync->cl.rx_callback = message_from_remote; + dc_sync->cl.tx_done = NULL; /* operate in blocking mode */ + dc_sync->cl.tx_block = true; + dc_sync->cl.tx_tout = 500; /* by half a second */ + dc_sync->cl.knows_txdone = false; /* depending upon protocol */ + dc_sync->async = false; + + /* ASync mailbox is listed second in 'mboxes' property */ + dc_async->mbox = mbox_request_channel(&dc_async->cl, 1); + /* Populate data packet */ + /* ap.xxx = 123; etc */ + /* Send async message to remote */ + mbox_send_message(dc_async->mbox, &ap); + + /* Sync mailbox is listed first in 'mboxes' property */ + dc_sync->mbox = mbox_request_channel(&dc_sync->cl, 0); + /* Populate data packet */ + /* sp.abc = 123; etc */ + /* Send message to remote in blocking mode */ + mbox_send_message(dc_sync->mbox, &sp); + /* At this point 'sp' has been sent */ + + /* Now wait for async chan to be done */ + wait_for_completion(&dc_async->c); +} -- cgit v1.2.3 From 9f3e3cacb2ffdefe28c7cf490bf543e4dcb2770a Mon Sep 17 00:00:00 2001 From: Jassi Brar Date: Tue, 22 Jul 2014 20:40:04 +0530 Subject: dt: mailbox: add generic bindings Define generic bindings for the framework clients to request mailbox channels. Reviewed-by: Mark Brown Signed-off-by: Jassi Brar --- .../devicetree/bindings/mailbox/mailbox.txt | 38 ++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 Documentation/devicetree/bindings/mailbox/mailbox.txt diff --git a/Documentation/devicetree/bindings/mailbox/mailbox.txt b/Documentation/devicetree/bindings/mailbox/mailbox.txt new file mode 100644 index 000000000000..1a2cd3d266db --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/mailbox.txt @@ -0,0 +1,38 @@ +* Generic Mailbox Controller and client driver bindings + +Generic binding to provide a way for Mailbox controller drivers to +assign appropriate mailbox channel to client drivers. + +* Mailbox Controller + +Required property: +- #mbox-cells: Must be at least 1. Number of cells in a mailbox + specifier. + +Example: + mailbox: mailbox { + ... + #mbox-cells = <1>; + }; + + +* Mailbox Client + +Required property: +- mboxes: List of phandle and mailbox channel specifiers. + +Optional property: +- mbox-names: List of identifier strings for each mailbox channel + required by the client. The use of this property + is discouraged in favor of using index in list of + 'mboxes' while requesting a mailbox. Instead the + platforms may define channel indices, in DT headers, + to something legible. + +Example: + pwr_cntrl: power { + ... + mbox-names = "pwr-ctrl", "rpc"; + mboxes = <&mailbox 0 + &mailbox 1>; + }; -- cgit v1.2.3 From 62d3ab49b8a5438d11a11605ea1a6d2fe0118f32 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 6 Oct 2014 16:40:13 -0700 Subject: target/file: fix inclusive vfs_fsync_range() end Both of the file target's calls to vfs_fsync_range() got the end offset off by one. The range is inclusive, not exclusive. It would sync a bit more data than was required. The sync path already tested the length of the range and fell back to LLONG_MAX so I copied that pattern in the rw path. This is untested. I found the errors by inspection while following other code. Signed-off-by: Zach Brown Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_file.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c index ab2c53b63cc3..72c83d98662b 100644 --- a/drivers/target/target_core_file.c +++ b/drivers/target/target_core_file.c @@ -415,7 +415,7 @@ fd_execute_sync_cache(struct se_cmd *cmd) } else { start = cmd->t_task_lba * dev->dev_attrib.block_size; if (cmd->data_length) - end = start + cmd->data_length; + end = start + cmd->data_length - 1; else end = LLONG_MAX; } @@ -680,7 +680,12 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents, struct fd_dev *fd_dev = FD_DEV(dev); loff_t start = cmd->t_task_lba * dev->dev_attrib.block_size; - loff_t end = start + cmd->data_length; + loff_t end; + + if (cmd->data_length) + end = start + cmd->data_length - 1; + else + end = LLONG_MAX; vfs_fsync_range(fd_dev->fd_file, start, end, 1); } -- cgit v1.2.3 From 1180e20606fd7c5d76dc5b2a1594fa51ba5a0f31 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Tue, 7 Oct 2014 18:02:52 -0300 Subject: drm/i915: properly reenable gen8 pipe IRQs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were missing the pipe B/C vblank bits! Take a look at gen8_de_irq_postinstall for a comparison. This should fix a bunch of IGT tests. There are a few more things we could improve on this code, but this should be the minimal fix to unblock us. v2: s/extra_iir/extra_ier/ because IIR doesn't make sense (Ville) Bugzilla:https://bugs.freedesktop.org/show_bug.cgi?id=83640 Testcase: igt/* Cc: stable@vger.kernel.org Signed-off-by: Paulo Zanoni Reviewed-by: Ville Syrjälä Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_irq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 3201986bf25e..54bf54706d1d 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -3458,12 +3458,13 @@ static void gen8_irq_reset(struct drm_device *dev) void gen8_irq_power_well_post_enable(struct drm_i915_private *dev_priv) { unsigned long irqflags; + uint32_t extra_ier = GEN8_PIPE_VBLANK | GEN8_PIPE_FIFO_UNDERRUN; spin_lock_irqsave(&dev_priv->irq_lock, irqflags); GEN8_IRQ_INIT_NDX(DE_PIPE, PIPE_B, dev_priv->de_irq_mask[PIPE_B], - ~dev_priv->de_irq_mask[PIPE_B]); + ~dev_priv->de_irq_mask[PIPE_B] | extra_ier); GEN8_IRQ_INIT_NDX(DE_PIPE, PIPE_C, dev_priv->de_irq_mask[PIPE_C], - ~dev_priv->de_irq_mask[PIPE_C]); + ~dev_priv->de_irq_mask[PIPE_C] | extra_ier); spin_unlock_irqrestore(&dev_priv->irq_lock, irqflags); } -- cgit v1.2.3 From 26bb0e9a1a938ec98ee07aa76533f1a711fba706 Mon Sep 17 00:00:00 2001 From: Lukasz Majewski Date: Wed, 24 Sep 2014 10:27:10 +0200 Subject: thermal: step_wise: fix: Prevent from binary overflow when trend is dropping It turns out that some boards can have instance->lower greater than 0 and when thermal trend is dropping it results with next_target equal to -1. Since the next_target is defined as unsigned long it is interpreted as 0xFFFFFFFF and larger than instance->upper. As a result the next_target is set to instance->upper which ramps up to maximal cooling device target when the temperature is steadily decreasing. Signed-off-by: Lukasz Majewski Signed-off-by: Zhang Rui --- drivers/thermal/step_wise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index f251521baaa2..6705a0d746b3 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -76,7 +76,7 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = instance->upper; break; case THERMAL_TREND_DROPPING: - if (cur_state == instance->lower) { + if (cur_state <= instance->lower) { if (!throttle) next_target = THERMAL_NO_TARGET; } else { -- cgit v1.2.3 From 083bf668cb70e47b84db64856606e94beac87f01 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Mar 2014 14:06:25 +0800 Subject: ACPI: make acpi_create_platform_device() an external API Signed-off-by: Zhang Rui --- drivers/acpi/acpi_platform.c | 1 + drivers/acpi/internal.h | 7 ------- include/linux/acpi.h | 1 + 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 2bf9082f7523..a3c89a1bcf54 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -113,3 +113,4 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) kfree(resources); return pdev; } +EXPORT_SYMBOL_GPL(acpi_create_platform_device); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index de47f9f746c9..f221d1eb594a 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -168,13 +168,6 @@ static inline int suspend_nvs_save(void) { return 0; } static inline void suspend_nvs_restore(void) {} #endif -/*-------------------------------------------------------------------------- - Platform bus support - -------------------------------------------------------------------------- */ -struct platform_device; - -struct platform_device *acpi_create_platform_device(struct acpi_device *adev); - /*-------------------------------------------------------------------------- Video -------------------------------------------------------------------------- */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 807cbc46d73e..2c24c2c1be45 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -432,6 +432,7 @@ static inline bool acpi_driver_match_device(struct device *dev, int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); int acpi_device_modalias(struct device *, char *, int); +struct platform_device *acpi_create_platform_device(struct acpi_device *); #define ACPI_PTR(_ptr) (_ptr) #else /* !CONFIG_ACPI */ -- cgit v1.2.3 From e3ec483a7e24c6ebb5eb763ee56c65c239701066 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 16 Mar 2014 21:34:16 +0800 Subject: ACPI: add ACPI_TYPE_LOCAL_REFERENCE support to acpi_extract_package() Add ACPI_TYPE_LOCAL_REFERENCE support to acpi_extract_package(), so that we can use this helper for more cases like _ART/_TRT. Signed-off-by: Zhang Rui --- drivers/acpi/utils.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index 07c8c5a5ee95..1ed7aba99d8d 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -149,6 +149,21 @@ acpi_extract_package(union acpi_object *package, break; } break; + case ACPI_TYPE_LOCAL_REFERENCE: + switch (format_string[i]) { + case 'R': + size_required += sizeof(void *); + tail_offset += sizeof(void *); + break; + default: + printk(KERN_WARNING PREFIX "Invalid package element" + " [%d] got reference," + " expecting [%c]\n", + i, format_string[i]); + return AE_BAD_DATA; + break; + } + break; case ACPI_TYPE_PACKAGE: default: @@ -247,7 +262,18 @@ acpi_extract_package(union acpi_object *package, break; } break; - + case ACPI_TYPE_LOCAL_REFERENCE: + switch (format_string[i]) { + case 'R': + *(void **)head = + (void *)element->reference.handle; + head += sizeof(void *); + break; + default: + /* Should never get here */ + break; + } + break; case ACPI_TYPE_PACKAGE: /* TBD: handle nested packages... */ default: -- cgit v1.2.3 From 816cab931f288c92a3404b1b984576f4822b0445 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 14 Mar 2014 12:45:05 +0800 Subject: Thermal: introduce int3400 thermal driver Introduce int3400 thermal driver. And make INT3400 driver enumerate the other int340x thermal components shown in _ART/_TRT. Signed-off-by: Zhang Rui --- drivers/acpi/int340x_thermal.c | 2 +- drivers/thermal/Kconfig | 2 +- drivers/thermal/Makefile | 1 + drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/int3400_thermal.c | 245 ++++++++++++++++++++++ 5 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 drivers/thermal/int340x_thermal/Makefile create mode 100644 drivers/thermal/int340x_thermal/int3400_thermal.c diff --git a/drivers/acpi/int340x_thermal.c b/drivers/acpi/int340x_thermal.c index 2103bb6d9016..a27d31d1ba24 100644 --- a/drivers/acpi/int340x_thermal.c +++ b/drivers/acpi/int340x_thermal.c @@ -33,7 +33,7 @@ static const struct acpi_device_id int340x_thermal_device_ids[] = { static int int340x_thermal_handler_attach(struct acpi_device *adev, const struct acpi_device_id *id) { -#ifdef CONFIG_INT340X_THERMAL +#if defined(CONFIG_INT340X_THERMAL) || defined(CONFIG_INT340X_THERMAL_MODULE) if (id->driver_data == DO_ENUMERATION) acpi_create_platform_device(adev); #endif diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 2ff7416ca930..6f5a87a8f19f 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -235,7 +235,7 @@ config INTEL_SOC_DTS_THERMAL was set by the driver based on the TJ MAX temperature. config INT340X_THERMAL - bool + tristate "ACPI INT340X thermal drivers" depends on X86 && ACPI help Newer laptops and tablets that use ACPI may have thermal sensors and diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 31e232f84b6b..216503eaa232 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -32,4 +32,5 @@ obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o obj-$(CONFIG_INTEL_SOC_DTS_THERMAL) += intel_soc_dts_thermal.o obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/ obj-$(CONFIG_ACPI_INT3403_THERMAL) += int3403_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_ST_THERMAL) += st/ diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile new file mode 100644 index 000000000000..e10a53bcefe7 --- /dev/null +++ b/drivers/thermal/int340x_thermal/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c new file mode 100644 index 000000000000..308c1850edee --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -0,0 +1,245 @@ +/* + * INT3400 thermal driver + * + * Copyright (C) 2014, Intel Corporation + * Authors: Zhang Rui + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include + +struct art { + acpi_handle source; + acpi_handle target; + u64 weight; + u64 ac0_max; + u64 ac1_max; + u64 ac2_max; + u64 ac3_max; + u64 ac4_max; + u64 ac5_max; + u64 ac6_max; + u64 ac7_max; + u64 ac8_max; + u64 ac9_max; +}; + +struct trt { + acpi_handle source; + acpi_handle target; + u64 influence; + u64 sampling_period; + u64 reverved1; + u64 reverved2; + u64 reverved3; + u64 reverved4; +}; + +struct int3400_thermal_priv { + struct acpi_device *adev; + int art_count; + struct art *arts; + int trt_count; + struct trt *trts; +}; + +static int parse_art(struct int3400_thermal_priv *priv) +{ + acpi_handle handle = priv->adev->handle; + acpi_status status; + int result = 0; + int i; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer art_format = { + sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" }; + + if (!acpi_has_method(handle, "_ART")) + return 0; + + status = acpi_evaluate_object(handle, "_ART", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _ART data\n"); + result = -EFAULT; + goto end; + } + + /* ignore p->package.elements[0], as this is _ART Revision field */ + priv->art_count = p->package.count - 1; + priv->arts = kzalloc(sizeof(struct art) * priv->art_count, GFP_KERNEL); + if (!priv->arts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < priv->art_count; i++) { + struct art *art = &(priv->arts[i]); + + element.length = sizeof(struct art); + element.pointer = art; + + status = acpi_extract_package(&(p->package.elements[i + 1]), + &art_format, &element); + if (ACPI_FAILURE(status)) { + pr_err("Invalid _ART data"); + result = -EFAULT; + kfree(priv->arts); + goto end; + } + result = acpi_bus_get_device(art->source, &adev); + if (!result) + acpi_create_platform_device(adev, NULL); + else + pr_warn("Failed to get source ACPI device\n"); + result = acpi_bus_get_device(art->target, &adev); + if (!result) + acpi_create_platform_device(adev, NULL); + else + pr_warn("Failed to get source ACPI device\n"); + } +end: + kfree(buffer.pointer); + return result; +} + +static int parse_trt(struct int3400_thermal_priv *priv) +{ + acpi_handle handle = priv->adev->handle; + acpi_status status; + int result = 0; + int i; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" }; + + if (!acpi_has_method(handle, "_TRT")) + return 0; + + status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _TRT data\n"); + result = -EFAULT; + goto end; + } + + priv->trt_count = p->package.count; + priv->trts = kzalloc(sizeof(struct trt) * priv->trt_count, GFP_KERNEL); + if (!priv->trts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < priv->trt_count; i++) { + struct trt *trt = &(priv->trts[i]); + + element.length = sizeof(struct trt); + element.pointer = trt; + + status = acpi_extract_package(&(p->package.elements[i]), + &trt_format, &element); + if (ACPI_FAILURE(status)) { + pr_err("Invalid _ART data"); + result = -EFAULT; + kfree(priv->trts); + goto end; + } + + result = acpi_bus_get_device(trt->source, &adev); + if (!result) + acpi_create_platform_device(adev, NULL); + else + pr_warn("Failed to get source ACPI device\n"); + result = acpi_bus_get_device(trt->target, &adev); + if (!result) + acpi_create_platform_device(adev, NULL); + else + pr_warn("Failed to get target ACPI device\n"); + } +end: + kfree(buffer.pointer); + return result; +} + +static int int3400_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3400_thermal_priv *priv; + int result; + + if (!adev) + return -ENODEV; + + priv = kzalloc(sizeof(struct int3400_thermal_priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->adev = adev; + + result = parse_art(priv); + if (result) + goto free_priv; + + result = parse_trt(priv); + if (result) + goto free_art; + + platform_set_drvdata(pdev, priv); + + return 0; +free_art: + kfree(priv->arts); +free_priv: + kfree(priv); + return result; +} + +static int int3400_thermal_remove(struct platform_device *pdev) +{ + struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + + kfree(priv->trts); + kfree(priv->arts); + kfree(priv); + return 0; +} + +static const struct acpi_device_id int3400_thermal_match[] = { + {"INT3400", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3400_thermal_match); + +static struct platform_driver int3400_thermal_driver = { + .probe = int3400_thermal_probe, + .remove = int3400_thermal_remove, + .driver = { + .name = "int3400 thermal", + .owner = THIS_MODULE, + .acpi_match_table = ACPI_PTR(int3400_thermal_match), + }, +}; + +module_platform_driver(int3400_thermal_driver); + +MODULE_DESCRIPTION("INT3400 Thermal driver"); +MODULE_AUTHOR("Zhang Rui "); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From c5738dddc01be49cf6712f886371177e8df36291 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 23 Mar 2014 23:34:26 +0800 Subject: Thermal: int3400 thermal: add capability to detect supporting UUIDs Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/int3400_thermal.c | 69 +++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c index 308c1850edee..65c63ba8b314 100644 --- a/drivers/thermal/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -41,14 +41,79 @@ struct trt { u64 reverved4; }; +enum int3400_thermal_uuid { + INT3400_THERMAL_PASSIVE_1, + INT3400_THERMAL_PASSIVE_2, + INT3400_THERMAL_ACTIVE, + INT3400_THERMAL_CRITICAL, + INT3400_THERMAL_COOLING_MODE, + INT3400_THERMAL_MAXIMUM_UUID, +}; + +static u8 *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = { + "42A441D6-AE6A-462b-A84B-4A8CE79027D3", + "9E04115A-AE87-4D1C-9500-0F3E340BFE75", + "3A95C389-E4B8-4629-A526-C52C88626BAE", + "97C68AE7-15FA-499c-B8C9-5DA81D606E0A", + "16CAF1B7-DD38-40ed-B1C1-1B8A1913D531", +}; + struct int3400_thermal_priv { struct acpi_device *adev; int art_count; struct art *arts; int trt_count; struct trt *trts; + u8 uuid_bitmap; }; +static int int3400_thermal_get_uuids(struct int3400_thermal_priv *priv) +{ + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *obja, *objb; + int i, j; + int result = 0; + acpi_status status; + + status = acpi_evaluate_object(priv->adev->handle, "IDSP", NULL, &buf); + if (ACPI_FAILURE(status)) + return -ENODEV; + + obja = (union acpi_object *)buf.pointer; + if (obja->type != ACPI_TYPE_PACKAGE) { + result = -EINVAL; + goto end; + } + + for (i = 0; i < obja->package.count; i++) { + objb = &obja->package.elements[i]; + if (objb->type != ACPI_TYPE_BUFFER) { + result = -EINVAL; + goto end; + } + + /* UUID must be 16 bytes */ + if (objb->buffer.length != 16) { + result = -EINVAL; + goto end; + } + + for (j = 0; j < INT3400_THERMAL_MAXIMUM_UUID; j++) { + u8 uuid[16]; + + acpi_str_to_uuid(int3400_thermal_uuids[j], uuid); + if (!strncmp(uuid, objb->buffer.pointer, 16)) { + priv->uuid_bitmap |= (1 << j); + break; + } + } + } + +end: + kfree(buf.pointer); + return result; +} + static int parse_art(struct int3400_thermal_priv *priv) { acpi_handle handle = priv->adev->handle; @@ -193,6 +258,10 @@ static int int3400_thermal_probe(struct platform_device *pdev) priv->adev = adev; + result = int3400_thermal_get_uuids(priv); + if (result) + goto free_priv; + result = parse_art(priv); if (result) goto free_priv; -- cgit v1.2.3 From 0ab15365ff38c8433f4d0cc2d11a1184e2c991cf Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Sun, 23 Mar 2014 23:37:32 +0800 Subject: Thermal: int3400 thermal: register to thermal framework Signed-off-by: Zhang Rui --- drivers/thermal/Kconfig | 1 + drivers/thermal/int340x_thermal/int3400_thermal.c | 103 ++++++++++++++++++++++ 2 files changed, 104 insertions(+) diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 6f5a87a8f19f..b34c5f54fc83 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -237,6 +237,7 @@ config INTEL_SOC_DTS_THERMAL config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" depends on X86 && ACPI + select THERMAL_GOV_USER_SPACE help Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c index 65c63ba8b314..9104b4f9381b 100644 --- a/drivers/thermal/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -13,6 +13,7 @@ #include #include #include +#include struct art { acpi_handle source; @@ -60,6 +61,8 @@ static u8 *int3400_thermal_uuids[INT3400_THERMAL_MAXIMUM_UUID] = { struct int3400_thermal_priv { struct acpi_device *adev; + struct thermal_zone_device *thermal; + int mode; int art_count; struct art *arts; int trt_count; @@ -114,6 +117,36 @@ end: return result; } +static int int3400_thermal_run_osc(acpi_handle handle, + enum int3400_thermal_uuid uuid, bool enable) +{ + u32 ret, buf[2]; + acpi_status status; + int result = 0; + struct acpi_osc_context context = { + .uuid_str = int3400_thermal_uuids[uuid], + .rev = 1, + .cap.length = 8, + }; + + buf[OSC_QUERY_DWORD] = 0; + buf[OSC_SUPPORT_DWORD] = enable; + + context.cap.pointer = buf; + + status = acpi_run_osc(handle, &context); + if (ACPI_SUCCESS(status)) { + ret = *((u32 *)(context.ret.pointer + 4)); + if (ret != enable) + result = -EPERM; + } else + result = -EPERM; + + kfree(context.ret.pointer); + return result; +} + + static int parse_art(struct int3400_thermal_priv *priv) { acpi_handle handle = priv->adev->handle; @@ -243,6 +276,61 @@ end: return result; } +static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, + unsigned long *temp) +{ + *temp = 20 * 1000; /* faked temp sensor with 20C */ + return 0; +} + +static int int3400_thermal_get_mode(struct thermal_zone_device *thermal, + enum thermal_device_mode *mode) +{ + struct int3400_thermal_priv *priv = thermal->devdata; + + if (!priv) + return -EINVAL; + + *mode = priv->mode; + + return 0; +} + +static int int3400_thermal_set_mode(struct thermal_zone_device *thermal, + enum thermal_device_mode mode) +{ + struct int3400_thermal_priv *priv = thermal->devdata; + bool enable; + int result = 0; + + if (!priv) + return -EINVAL; + + if (mode == THERMAL_DEVICE_ENABLED) + enable = true; + else if (mode == THERMAL_DEVICE_DISABLED) + enable = false; + else + return -EINVAL; + + if (enable != priv->mode) { + priv->mode = enable; + /* currently, only PASSIVE COOLING is supported */ + result = int3400_thermal_run_osc(priv->adev->handle, + INT3400_THERMAL_PASSIVE_1, enable); + } + return result; +} + +static struct thermal_zone_device_ops int3400_thermal_ops = { + .get_temp = int3400_thermal_get_temp, +}; + +static struct thermal_zone_params int3400_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + static int int3400_thermal_probe(struct platform_device *pdev) { struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); @@ -272,7 +360,21 @@ static int int3400_thermal_probe(struct platform_device *pdev) platform_set_drvdata(pdev, priv); + if (priv->uuid_bitmap & 1 << INT3400_THERMAL_PASSIVE_1) { + int3400_thermal_ops.get_mode = int3400_thermal_get_mode; + int3400_thermal_ops.set_mode = int3400_thermal_set_mode; + } + priv->thermal = thermal_zone_device_register("INT3400 Thermal", 0, 0, + priv, &int3400_thermal_ops, + &int3400_thermal_params, 0, 0); + if (IS_ERR(priv->thermal)) { + result = PTR_ERR(priv->thermal); + goto free_trt; + } + return 0; +free_trt: + kfree(priv->trts); free_art: kfree(priv->arts); free_priv: @@ -284,6 +386,7 @@ static int int3400_thermal_remove(struct platform_device *pdev) { struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + thermal_zone_device_unregister(priv->thermal); kfree(priv->trts); kfree(priv->arts); kfree(priv); -- cgit v1.2.3 From bd6ad24342c3bc0476ba3f0b647c5ba0a71b45b4 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 19 Nov 2013 14:25:58 +0800 Subject: ACPI / fan: remove unused macro The _COMPONENT, ACPI_MODULE_NAME(name) and ACPI_FAN_FILE_STATE are not used anywhere so remove them. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/fan.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 8acf53e62966..562d5f3a1bb5 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -34,10 +34,6 @@ #define PREFIX "ACPI: " #define ACPI_FAN_CLASS "fan" -#define ACPI_FAN_FILE_STATE "state" - -#define _COMPONENT ACPI_FAN_COMPONENT -ACPI_MODULE_NAME("fan"); MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Fan Driver"); -- cgit v1.2.3 From 8dd41f78adebb57909cccb0272e74c79e38b5238 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 19 Nov 2013 15:21:24 +0800 Subject: ACPI / fan: remove no need check for device pointer The device pointer will not be NULL in the PM callback and ACPI driver's add/remove callback, so checking NULL for them isn't necessary. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/fan.c | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 562d5f3a1bb5..df861bbc73cc 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -131,9 +131,6 @@ static int acpi_fan_add(struct acpi_device *device) int result = 0; struct thermal_cooling_device *cdev; - if (!device) - return -EINVAL; - strcpy(acpi_device_name(device), "Fan"); strcpy(acpi_device_class(device), ACPI_FAN_CLASS); @@ -177,14 +174,7 @@ end: static int acpi_fan_remove(struct acpi_device *device) { - struct thermal_cooling_device *cdev; - - if (!device) - return -EINVAL; - - cdev = acpi_driver_data(device); - if (!cdev) - return -EINVAL; + struct thermal_cooling_device *cdev = acpi_driver_data(device); sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); sysfs_remove_link(&cdev->device.kobj, "device"); @@ -196,9 +186,6 @@ static int acpi_fan_remove(struct acpi_device *device) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { - if (!dev) - return -EINVAL; - acpi_bus_set_power(to_acpi_device(dev)->handle, ACPI_STATE_D0); return AE_OK; @@ -208,9 +195,6 @@ static int acpi_fan_resume(struct device *dev) { int result; - if (!dev) - return -EINVAL; - result = acpi_bus_update_power(to_acpi_device(dev)->handle, NULL); if (result) printk(KERN_ERR PREFIX "Error updating fan power state\n"); -- cgit v1.2.3 From 2bb3a2bf9939f3361e25045f4ef7b136b864c3b8 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 19 Nov 2013 15:43:52 +0800 Subject: ACPI / fan: use acpi_device_xxx_power instead of acpi_bus equivelant When we have the acpi_device pointer, there is no need to pass the device's handle to the acpi_bus_xxx_power functions to get/set/update the device's power state, instead, use the acpi_device_xxx_power functions directly. To make this happen for fan module, export acpi_device_update_power. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/device_pm.c | 1 + drivers/acpi/fan.c | 10 +++++----- drivers/acpi/internal.h | 2 -- include/acpi/acpi_bus.h | 1 + 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 67075f800e34..91775475e367 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -343,6 +343,7 @@ int acpi_device_update_power(struct acpi_device *device, int *state_p) return 0; } +EXPORT_SYMBOL_GPL(acpi_device_update_power); int acpi_bus_update_power(acpi_handle handle, int *state_p) { diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index df861bbc73cc..fff9696bea25 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -92,7 +92,7 @@ static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long if (!device) return -EINVAL; - result = acpi_bus_update_power(device->handle, &acpi_state); + result = acpi_device_update_power(device, &acpi_state); if (result) return result; @@ -110,7 +110,7 @@ fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) if (!device || (state != 0 && state != 1)) return -EINVAL; - result = acpi_bus_set_power(device->handle, + result = acpi_device_set_power(device, state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); return result; @@ -134,7 +134,7 @@ static int acpi_fan_add(struct acpi_device *device) strcpy(acpi_device_name(device), "Fan"); strcpy(acpi_device_class(device), ACPI_FAN_CLASS); - result = acpi_bus_update_power(device->handle, NULL); + result = acpi_device_update_power(device, NULL); if (result) { printk(KERN_ERR PREFIX "Setting initial power state\n"); goto end; @@ -186,7 +186,7 @@ static int acpi_fan_remove(struct acpi_device *device) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { - acpi_bus_set_power(to_acpi_device(dev)->handle, ACPI_STATE_D0); + acpi_device_set_power(to_acpi_device(dev), ACPI_STATE_D0); return AE_OK; } @@ -195,7 +195,7 @@ static int acpi_fan_resume(struct device *dev) { int result; - result = acpi_bus_update_power(to_acpi_device(dev)->handle, NULL); + result = acpi_device_update_power(to_acpi_device(dev), NULL); if (result) printk(KERN_ERR PREFIX "Error updating fan power state\n"); diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index f221d1eb594a..447f6d679b29 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -104,8 +104,6 @@ int acpi_power_get_inferred_state(struct acpi_device *device, int *state); int acpi_power_on_resources(struct acpi_device *device, int state); int acpi_power_transition(struct acpi_device *device, int state); -int acpi_device_update_power(struct acpi_device *device, int *state_p); - int acpi_wakeup_device_init(void); #ifdef CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index bcfd808b1098..6ca32812f3da 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -435,6 +435,7 @@ int acpi_device_set_power(struct acpi_device *device, int state); int acpi_bus_init_power(struct acpi_device *device); int acpi_device_fix_up_power(struct acpi_device *device); int acpi_bus_update_power(acpi_handle handle, int *state_p); +int acpi_device_update_power(struct acpi_device *device, int *state_p); bool acpi_bus_power_manageable(acpi_handle handle); #ifdef CONFIG_PM -- cgit v1.2.3 From 19593a1fb1f6718406afca5b867dab184289d406 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 19 Nov 2013 16:59:20 +0800 Subject: ACPI / fan: convert to platform driver Convert ACPI fan driver to a platform driver for the purpose of phasing out ACPI bus. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/fan.c | 62 ++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 34 deletions(-) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index fff9696bea25..8a5b450576df 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -30,17 +30,14 @@ #include #include #include - -#define PREFIX "ACPI: " - -#define ACPI_FAN_CLASS "fan" +#include MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Fan Driver"); MODULE_LICENSE("GPL"); -static int acpi_fan_add(struct acpi_device *device); -static int acpi_fan_remove(struct acpi_device *device); +static int acpi_fan_probe(struct platform_device *pdev); +static int acpi_fan_remove(struct platform_device *pdev); static const struct acpi_device_id fan_device_ids[] = { {"PNP0C0B", 0}, @@ -62,15 +59,14 @@ static struct dev_pm_ops acpi_fan_pm = { #define FAN_PM_OPS_PTR NULL #endif -static struct acpi_driver acpi_fan_driver = { - .name = "fan", - .class = ACPI_FAN_CLASS, - .ids = fan_device_ids, - .ops = { - .add = acpi_fan_add, - .remove = acpi_fan_remove, - }, - .drv.pm = FAN_PM_OPS_PTR, +static struct platform_driver acpi_fan_driver = { + .probe = acpi_fan_probe, + .remove = acpi_fan_remove, + .driver = { + .name = "acpi-fan", + .acpi_match_table = fan_device_ids, + .pm = FAN_PM_OPS_PTR, + }, }; /* thermal cooling device callbacks */ @@ -126,17 +122,15 @@ static const struct thermal_cooling_device_ops fan_cooling_ops = { Driver Interface -------------------------------------------------------------------------- */ -static int acpi_fan_add(struct acpi_device *device) +static int acpi_fan_probe(struct platform_device *pdev) { int result = 0; struct thermal_cooling_device *cdev; - - strcpy(acpi_device_name(device), "Fan"); - strcpy(acpi_device_class(device), ACPI_FAN_CLASS); + struct acpi_device *device = ACPI_COMPANION(&pdev->dev); result = acpi_device_update_power(device, NULL); if (result) { - printk(KERN_ERR PREFIX "Setting initial power state\n"); + dev_err(&pdev->dev, "Setting initial power state\n"); goto end; } @@ -147,24 +141,24 @@ static int acpi_fan_add(struct acpi_device *device) goto end; } - dev_dbg(&device->dev, "registered as cooling_device%d\n", cdev->id); + dev_dbg(&pdev->dev, "registered as cooling_device%d\n", cdev->id); - device->driver_data = cdev; - result = sysfs_create_link(&device->dev.kobj, + platform_set_drvdata(pdev, cdev); + result = sysfs_create_link(&pdev->dev.kobj, &cdev->device.kobj, "thermal_cooling"); if (result) - dev_err(&device->dev, "Failed to create sysfs link " + dev_err(&pdev->dev, "Failed to create sysfs link " "'thermal_cooling'\n"); result = sysfs_create_link(&cdev->device.kobj, - &device->dev.kobj, + &pdev->dev.kobj, "device"); if (result) - dev_err(&device->dev, "Failed to create sysfs link " + dev_err(&pdev->dev, "Failed to create sysfs link " "'device'\n"); - printk(KERN_INFO PREFIX "%s [%s] (%s)\n", + dev_info(&pdev->dev, "%s [%s] (%s)\n", acpi_device_name(device), acpi_device_bid(device), !device->power.state ? "on" : "off"); @@ -172,11 +166,11 @@ end: return result; } -static int acpi_fan_remove(struct acpi_device *device) +static int acpi_fan_remove(struct platform_device *pdev) { - struct thermal_cooling_device *cdev = acpi_driver_data(device); + struct thermal_cooling_device *cdev = platform_get_drvdata(pdev); - sysfs_remove_link(&device->dev.kobj, "thermal_cooling"); + sysfs_remove_link(&pdev->dev.kobj, "thermal_cooling"); sysfs_remove_link(&cdev->device.kobj, "device"); thermal_cooling_device_unregister(cdev); @@ -186,7 +180,7 @@ static int acpi_fan_remove(struct acpi_device *device) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { - acpi_device_set_power(to_acpi_device(dev), ACPI_STATE_D0); + acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D0); return AE_OK; } @@ -195,12 +189,12 @@ static int acpi_fan_resume(struct device *dev) { int result; - result = acpi_device_update_power(to_acpi_device(dev), NULL); + result = acpi_device_update_power(ACPI_COMPANION(dev), NULL); if (result) - printk(KERN_ERR PREFIX "Error updating fan power state\n"); + dev_err(dev, "Error updating fan power state\n"); return result; } #endif -module_acpi_driver(acpi_fan_driver); +module_platform_driver(acpi_fan_driver); -- cgit v1.2.3 From 9519a6356cbf63b1f22a7a208385dc56092c8b7d Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 1 Apr 2014 15:50:27 +0800 Subject: ACPI / Fan: add ACPI 4.0 style fan support This patch adds support for ACPI 4.0 style fan, lacking part is: no support for 'Low Speed Notification Support', 'Fine Grain Control' is not used yet. It's not clear what to do on suspend/resume callback for 4.0 style ACPI fan, so it does nothing for now. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/fan.c | 268 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 241 insertions(+), 27 deletions(-) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index 8a5b450576df..f7d1c8027736 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -31,6 +31,7 @@ #include #include #include +#include MODULE_AUTHOR("Paul Diefenbaugh"); MODULE_DESCRIPTION("ACPI Fan Driver"); @@ -59,6 +60,29 @@ static struct dev_pm_ops acpi_fan_pm = { #define FAN_PM_OPS_PTR NULL #endif +struct acpi_fan_fps { + u64 control; + u64 trip_point; + u64 speed; + u64 noise_level; + u64 power; +}; + +struct acpi_fan_fif { + u64 revision; + u64 fine_grain_ctrl; + u64 step_size; + u64 low_speed_notification; +}; + +struct acpi_fan { + bool acpi4; + struct acpi_fan_fif fif; + struct acpi_fan_fps *fps; + int fps_count; + struct thermal_cooling_device *cdev; +}; + static struct platform_driver acpi_fan_driver = { .probe = acpi_fan_probe, .remove = acpi_fan_remove, @@ -73,21 +97,62 @@ static struct platform_driver acpi_fan_driver = { static int fan_get_max_state(struct thermal_cooling_device *cdev, unsigned long *state) { - /* ACPI fan device only support two states: ON/OFF */ - *state = 1; + struct acpi_device *device = cdev->devdata; + struct acpi_fan *fan = acpi_driver_data(device); + + if (fan->acpi4) + *state = fan->fps_count - 1; + else + *state = 1; return 0; } -static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long - *state) +static int fan_get_state_acpi4(struct acpi_device *device, unsigned long *state) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_fan *fan = acpi_driver_data(device); + union acpi_object *obj; + acpi_status status; + int control, i; + + status = acpi_evaluate_object(device->handle, "_FST", NULL, &buffer); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Get fan state failed\n"); + return status; + } + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE || + obj->package.count != 3 || + obj->package.elements[1].type != ACPI_TYPE_INTEGER) { + dev_err(&device->dev, "Invalid _FST data\n"); + status = -EINVAL; + goto err; + } + + control = obj->package.elements[1].integer.value; + for (i = 0; i < fan->fps_count; i++) { + if (control == fan->fps[i].control) + break; + } + if (i == fan->fps_count) { + dev_dbg(&device->dev, "Invalid control value returned\n"); + status = -EINVAL; + goto err; + } + + *state = i; + +err: + kfree(obj); + return status; +} + +static int fan_get_state(struct acpi_device *device, unsigned long *state) { - struct acpi_device *device = cdev->devdata; int result; int acpi_state = ACPI_STATE_D0; - if (!device) - return -EINVAL; - result = acpi_device_update_power(device, &acpi_state); if (result) return result; @@ -97,21 +162,57 @@ static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long return 0; } -static int -fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +static int fan_get_cur_state(struct thermal_cooling_device *cdev, unsigned long + *state) { struct acpi_device *device = cdev->devdata; - int result; + struct acpi_fan *fan = acpi_driver_data(device); + + if (fan->acpi4) + return fan_get_state_acpi4(device, state); + else + return fan_get_state(device, state); +} - if (!device || (state != 0 && state != 1)) +static int fan_set_state(struct acpi_device *device, unsigned long state) +{ + if (state != 0 && state != 1) return -EINVAL; - result = acpi_device_set_power(device, - state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); + return acpi_device_set_power(device, + state ? ACPI_STATE_D0 : ACPI_STATE_D3_COLD); +} - return result; +static int fan_set_state_acpi4(struct acpi_device *device, unsigned long state) +{ + struct acpi_fan *fan = acpi_driver_data(device); + acpi_status status; + + if (state >= fan->fps_count) + return -EINVAL; + + status = acpi_execute_simple_method(device->handle, "_FSL", + fan->fps[state].control); + if (ACPI_FAILURE(status)) { + dev_dbg(&device->dev, "Failed to set state by _FSL\n"); + return status; + } + + return 0; } +static int +fan_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +{ + struct acpi_device *device = cdev->devdata; + struct acpi_fan *fan = acpi_driver_data(device); + + if (fan->acpi4) + return fan_set_state_acpi4(device, state); + else + return fan_set_state(device, state); + } + static const struct thermal_cooling_device_ops fan_cooling_ops = { .get_max_state = fan_get_max_state, .get_cur_state = fan_get_cur_state, @@ -122,16 +223,125 @@ static const struct thermal_cooling_device_ops fan_cooling_ops = { Driver Interface -------------------------------------------------------------------------- */ +static bool acpi_fan_is_acpi4(struct acpi_device *device) +{ + return acpi_has_method(device->handle, "_FIF") && + acpi_has_method(device->handle, "_FPS") && + acpi_has_method(device->handle, "_FSL") && + acpi_has_method(device->handle, "_FST"); +} + +static int acpi_fan_get_fif(struct acpi_device *device) +{ + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_fan *fan = acpi_driver_data(device); + struct acpi_buffer format = { sizeof("NNNN"), "NNNN" }; + struct acpi_buffer fif = { sizeof(fan->fif), &fan->fif }; + union acpi_object *obj; + acpi_status status; + + status = acpi_evaluate_object(device->handle, "_FIF", NULL, &buffer); + if (ACPI_FAILURE(status)) + return status; + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE) { + dev_err(&device->dev, "Invalid _FIF data\n"); + status = -EINVAL; + goto err; + } + + status = acpi_extract_package(obj, &format, &fif); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Invalid _FIF element\n"); + status = -EINVAL; + } + +err: + kfree(obj); + return status; +} + +static int acpi_fan_speed_cmp(const void *a, const void *b) +{ + const struct acpi_fan_fps *fps1 = a; + const struct acpi_fan_fps *fps2 = b; + return fps1->speed - fps2->speed; +} + +static int acpi_fan_get_fps(struct acpi_device *device) +{ + struct acpi_fan *fan = acpi_driver_data(device); + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *obj; + acpi_status status; + int i; + + status = acpi_evaluate_object(device->handle, "_FPS", NULL, &buffer); + if (ACPI_FAILURE(status)) + return status; + + obj = buffer.pointer; + if (!obj || obj->type != ACPI_TYPE_PACKAGE || obj->package.count < 2) { + dev_err(&device->dev, "Invalid _FPS data\n"); + status = -EINVAL; + goto err; + } + + fan->fps_count = obj->package.count - 1; /* minus revision field */ + fan->fps = devm_kzalloc(&device->dev, + fan->fps_count * sizeof(struct acpi_fan_fps), + GFP_KERNEL); + if (!fan->fps) { + dev_err(&device->dev, "Not enough memory\n"); + status = -ENOMEM; + goto err; + } + for (i = 0; i < fan->fps_count; i++) { + struct acpi_buffer format = { sizeof("NNNNN"), "NNNNN" }; + struct acpi_buffer fps = { sizeof(fan->fps[i]), &fan->fps[i] }; + status = acpi_extract_package(&obj->package.elements[i + 1], + &format, &fps); + if (ACPI_FAILURE(status)) { + dev_err(&device->dev, "Invalid _FPS element\n"); + break; + } + } + + /* sort the state array according to fan speed in increase order */ + sort(fan->fps, fan->fps_count, sizeof(*fan->fps), + acpi_fan_speed_cmp, NULL); + +err: + kfree(obj); + return status; +} + static int acpi_fan_probe(struct platform_device *pdev) { int result = 0; struct thermal_cooling_device *cdev; + struct acpi_fan *fan; struct acpi_device *device = ACPI_COMPANION(&pdev->dev); - result = acpi_device_update_power(device, NULL); - if (result) { - dev_err(&pdev->dev, "Setting initial power state\n"); - goto end; + fan = devm_kzalloc(&pdev->dev, sizeof(*fan), GFP_KERNEL); + if (!fan) { + dev_err(&device->dev, "No memory for fan\n"); + return -ENOMEM; + } + device->driver_data = fan; + platform_set_drvdata(pdev, fan); + + if (acpi_fan_is_acpi4(device)) { + if (acpi_fan_get_fif(device) || acpi_fan_get_fps(device)) + goto end; + fan->acpi4 = true; + } else { + result = acpi_device_update_power(device, NULL); + if (result) { + dev_err(&device->dev, "Setting initial power state\n"); + goto end; + } } cdev = thermal_cooling_device_register("Fan", device, @@ -143,7 +353,7 @@ static int acpi_fan_probe(struct platform_device *pdev) dev_dbg(&pdev->dev, "registered as cooling_device%d\n", cdev->id); - platform_set_drvdata(pdev, cdev); + fan->cdev = cdev; result = sysfs_create_link(&pdev->dev.kobj, &cdev->device.kobj, "thermal_cooling"); @@ -158,21 +368,17 @@ static int acpi_fan_probe(struct platform_device *pdev) dev_err(&pdev->dev, "Failed to create sysfs link " "'device'\n"); - dev_info(&pdev->dev, "%s [%s] (%s)\n", - acpi_device_name(device), acpi_device_bid(device), - !device->power.state ? "on" : "off"); - end: return result; } static int acpi_fan_remove(struct platform_device *pdev) { - struct thermal_cooling_device *cdev = platform_get_drvdata(pdev); + struct acpi_fan *fan = platform_get_drvdata(pdev); sysfs_remove_link(&pdev->dev.kobj, "thermal_cooling"); - sysfs_remove_link(&cdev->device.kobj, "device"); - thermal_cooling_device_unregister(cdev); + sysfs_remove_link(&fan->cdev->device.kobj, "device"); + thermal_cooling_device_unregister(fan->cdev); return 0; } @@ -180,6 +386,10 @@ static int acpi_fan_remove(struct platform_device *pdev) #ifdef CONFIG_PM_SLEEP static int acpi_fan_suspend(struct device *dev) { + struct acpi_fan *fan = dev_get_drvdata(dev); + if (fan->acpi4) + return 0; + acpi_device_set_power(ACPI_COMPANION(dev), ACPI_STATE_D0); return AE_OK; @@ -188,6 +398,10 @@ static int acpi_fan_suspend(struct device *dev) static int acpi_fan_resume(struct device *dev) { int result; + struct acpi_fan *fan = dev_get_drvdata(dev); + + if (fan->acpi4) + return 0; result = acpi_device_update_power(ACPI_COMPANION(dev), NULL); if (result) -- cgit v1.2.3 From d806c6e9cdfd6c5663687ec7109e151c0ff66639 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 1 Apr 2014 15:54:05 +0800 Subject: ACPI / Fan: support INT3404 thermal device INT3404 ACPI object follows the ACPI 5.0 fan object definition as described in section 11.3 of the ACPI 5.0 Specification. Thus we can reuse the ACPI fan driver for INT3404 ACPI object. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/fan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/fan.c b/drivers/acpi/fan.c index f7d1c8027736..e007c4987bea 100644 --- a/drivers/acpi/fan.c +++ b/drivers/acpi/fan.c @@ -42,6 +42,7 @@ static int acpi_fan_remove(struct platform_device *pdev); static const struct acpi_device_id fan_device_ids[] = { {"PNP0C0B", 0}, + {"INT3404", 0}, {"", 0}, }; MODULE_DEVICE_TABLE(acpi, fan_device_ids); -- cgit v1.2.3 From 7b83fd9d91a411158f72d36958103c708c3b5a86 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 25 Mar 2014 10:40:09 +0800 Subject: Thermal: move the KELVIN_TO_MILLICELSIUS macro to thermal.h This macro can be used by other component so move it to a common header, but in a slightly different way: define two macros, one macro with an offset and the other doesn't. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/acpi/thermal.c | 18 +++++++++--------- include/linux/thermal.h | 2 ++ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c index 112817e963e0..d24fa1964eb8 100644 --- a/drivers/acpi/thermal.c +++ b/drivers/acpi/thermal.c @@ -528,7 +528,6 @@ static void acpi_thermal_check(void *data) } /* sys I/F for generic thermal sysfs support */ -#define KELVIN_TO_MILLICELSIUS(t, off) (((t) - (off)) * 100) static int thermal_get_temp(struct thermal_zone_device *thermal, unsigned long *temp) @@ -543,7 +542,8 @@ static int thermal_get_temp(struct thermal_zone_device *thermal, if (result) return result; - *temp = KELVIN_TO_MILLICELSIUS(tz->temperature, tz->kelvin_offset); + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(tz->temperature, + tz->kelvin_offset); return 0; } @@ -647,7 +647,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.critical.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -657,7 +657,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.hot.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.hot.temperature, tz->kelvin_offset); return 0; @@ -667,7 +667,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, if (tz->trips.passive.flags.valid) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.passive.temperature, tz->kelvin_offset); return 0; @@ -678,7 +678,7 @@ static int thermal_get_trip_temp(struct thermal_zone_device *thermal, for (i = 0; i < ACPI_THERMAL_MAX_ACTIVE && tz->trips.active[i].flags.valid; i++) { if (!trip) { - *temp = KELVIN_TO_MILLICELSIUS( + *temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.active[i].temperature, tz->kelvin_offset); return 0; @@ -694,7 +694,7 @@ static int thermal_get_crit_temp(struct thermal_zone_device *thermal, struct acpi_thermal *tz = thermal->devdata; if (tz->trips.critical.flags.valid) { - *temperature = KELVIN_TO_MILLICELSIUS( + *temperature = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( tz->trips.critical.temperature, tz->kelvin_offset); return 0; @@ -714,8 +714,8 @@ static int thermal_get_trend(struct thermal_zone_device *thermal, if (type == THERMAL_TRIP_ACTIVE) { unsigned long trip_temp; - unsigned long temp = KELVIN_TO_MILLICELSIUS(tz->temperature, - tz->kelvin_offset); + unsigned long temp = DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET( + tz->temperature, tz->kelvin_offset); if (thermal_get_trip_temp(thermal, trip, &trip_temp)) return -EINVAL; diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 0305cde21a74..79ce6b94884a 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -44,6 +44,8 @@ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ ((long)t-2732+5)/10 : ((long)t-2732-5)/10) #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) +#define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) +#define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit v1.2.3 From 77e337c6e23e3b9d22e09ffec202a80f755a54c2 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 3 Sep 2014 15:13:02 +0800 Subject: Thermal: introduce INT3402 thermal driver ACPI INT3402 device object could report temperature for the memory module. To expose such information to user space, a thermal zone device is registered for it so that the thermal sysfs interface can expose such information for userspace to use. Signed-off-by: Aaron Lu Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/int3402_thermal.c | 242 ++++++++++++++++++++++ include/linux/thermal.h | 2 + 3 files changed, 245 insertions(+) create mode 100644 drivers/thermal/int340x_thermal/int3402_thermal.c diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile index e10a53bcefe7..67c98fd24200 100644 --- a/drivers/thermal/int340x_thermal/Makefile +++ b/drivers/thermal/int340x_thermal/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c new file mode 100644 index 000000000000..a5d08c14ba24 --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3402_thermal.c @@ -0,0 +1,242 @@ +/* + * INT3402 thermal driver for memory temperature reporting + * + * Copyright (C) 2014, Intel Corporation + * Authors: Aaron Lu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include + +#define ACPI_ACTIVE_COOLING_MAX_NR 10 + +struct active_trip { + unsigned long temp; + int id; + bool valid; +}; + +struct int3402_thermal_data { + unsigned long *aux_trips; + int aux_trip_nr; + unsigned long psv_temp; + int psv_trip_id; + unsigned long crt_temp; + int crt_trip_id; + unsigned long hot_temp; + int hot_trip_id; + struct active_trip act_trips[ACPI_ACTIVE_COOLING_MAX_NR]; + acpi_handle *handle; +}; + +static int int3402_thermal_get_zone_temp(struct thermal_zone_device *zone, + unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(d->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -ENODEV; + + /* _TMP returns the temperature in tenths of degrees Kelvin */ + *temp = DECI_KELVIN_TO_MILLICELSIUS(tmp); + + return 0; +} + +static int int3402_thermal_get_trip_temp(struct thermal_zone_device *zone, + int trip, unsigned long *temp) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *temp = d->aux_trips[trip]; + else if (trip == d->crt_trip_id) + *temp = d->crt_temp; + else if (trip == d->psv_trip_id) + *temp = d->psv_temp; + else if (trip == d->hot_trip_id) + *temp = d->hot_temp; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *temp = d->act_trips[i].temp; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_get_trip_type(struct thermal_zone_device *zone, + int trip, enum thermal_trip_type *type) +{ + struct int3402_thermal_data *d = zone->devdata; + int i; + + if (trip < d->aux_trip_nr) + *type = THERMAL_TRIP_PASSIVE; + else if (trip == d->crt_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else if (trip == d->hot_trip_id) + *type = THERMAL_TRIP_HOT; + else if (trip == d->psv_trip_id) + *type = THERMAL_TRIP_PASSIVE; + else { + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + if (d->act_trips[i].valid && + d->act_trips[i].id == trip) { + *type = THERMAL_TRIP_ACTIVE; + break; + } + } + if (i == ACPI_ACTIVE_COOLING_MAX_NR) + return -EINVAL; + } + return 0; +} + +static int int3402_thermal_set_trip_temp(struct thermal_zone_device *zone, int trip, + unsigned long temp) +{ + struct int3402_thermal_data *d = zone->devdata; + acpi_status status; + char name[10]; + + snprintf(name, sizeof(name), "PAT%d", trip); + status = acpi_execute_simple_method(d->handle, name, + MILLICELSIUS_TO_DECI_KELVIN(temp)); + if (ACPI_FAILURE(status)) + return -EIO; + + d->aux_trips[trip] = temp; + return 0; +} + +static struct thermal_zone_device_ops int3402_thermal_zone_ops = { + .get_temp = int3402_thermal_get_zone_temp, + .get_trip_temp = int3402_thermal_get_trip_temp, + .get_trip_type = int3402_thermal_get_trip_type, + .set_trip_temp = int3402_thermal_set_trip_temp, +}; + +static struct thermal_zone_params int3402_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static int int3402_thermal_get_temp(acpi_handle handle, char *name, + unsigned long *temp) +{ + unsigned long long r; + acpi_status status; + + status = acpi_evaluate_integer(handle, name, NULL, &r); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLICELSIUS(r); + return 0; +} + +static int int3402_thermal_probe(struct platform_device *pdev) +{ + struct acpi_device *adev = ACPI_COMPANION(&pdev->dev); + struct int3402_thermal_data *d; + struct thermal_zone_device *zone; + acpi_status status; + unsigned long long trip_cnt; + int trip_mask = 0, i; + + if (!acpi_has_method(adev->handle, "_TMP")) + return -ENODEV; + + d = devm_kzalloc(&pdev->dev, sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + status = acpi_evaluate_integer(adev->handle, "PATC", NULL, &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + else { + d->aux_trips = devm_kzalloc(&pdev->dev, + sizeof(*d->aux_trips) * trip_cnt, GFP_KERNEL); + if (!d->aux_trips) + return -ENOMEM; + trip_mask = trip_cnt - 1; + d->handle = adev->handle; + d->aux_trip_nr = trip_cnt; + } + + d->crt_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_CRT", &d->crt_temp)) + d->crt_trip_id = trip_cnt++; + d->hot_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_HOT", &d->hot_temp)) + d->hot_trip_id = trip_cnt++; + d->psv_trip_id = -1; + if (!int3402_thermal_get_temp(adev->handle, "_PSV", &d->psv_temp)) + d->psv_trip_id = trip_cnt++; + for (i = 0; i < ACPI_ACTIVE_COOLING_MAX_NR; i++) { + char name[5] = { '_', 'A', 'C', '0' + i, '\0' }; + if (int3402_thermal_get_temp(adev->handle, name, + &d->act_trips[i].temp)) + break; + d->act_trips[i].id = trip_cnt++; + d->act_trips[i].valid = true; + } + + zone = thermal_zone_device_register(acpi_device_bid(adev), trip_cnt, + trip_mask, d, + &int3402_thermal_zone_ops, + &int3402_thermal_params, + 0, 0); + if (IS_ERR(zone)) + return PTR_ERR(zone); + platform_set_drvdata(pdev, zone); + + return 0; +} + +static int int3402_thermal_remove(struct platform_device *pdev) +{ + struct thermal_zone_device *zone = platform_get_drvdata(pdev); + + thermal_zone_device_unregister(zone); + return 0; +} + +static const struct acpi_device_id int3402_thermal_match[] = { + {"INT3402", 0}, + {} +}; + +MODULE_DEVICE_TABLE(acpi, int3402_thermal_match); + +static struct platform_driver int3402_thermal_driver = { + .probe = int3402_thermal_probe, + .remove = int3402_thermal_remove, + .driver = { + .name = "int3402 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3402_thermal_match, + }, +}; + +module_platform_driver(int3402_thermal_driver); + +MODULE_DESCRIPTION("INT3402 Thermal driver"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 79ce6b94884a..ef90838b36a0 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -46,6 +46,8 @@ #define CELSIUS_TO_KELVIN(t) ((t)*10+2732) #define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) #define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) +#define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) +#define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) /* Adding event notification support elements */ #define THERMAL_GENL_FAMILY_NAME "thermal_event" -- cgit v1.2.3 From 1028ccf560b97adbf272381a61a67e17d44d1054 Mon Sep 17 00:00:00 2001 From: Romeo Cane Date: Thu, 2 Oct 2014 15:41:39 +0100 Subject: powerpc: Fix sys_call_table declaration to enable syscall tracing Declaring sys_call_table as a pointer causes the compiler to generate the wrong lookup code in arch_syscall_addr(). : lis r9,-16384 rlwinm r3,r3,2,0,29 - lwz r11,30640(r9) - lwzx r3,r11,r3 + addi r9,r9,30640 + lwzx r3,r9,r3 blr The actual sys_call_table symbol, declared in assembler, is an array. If we lie about that to the compiler we get the wrong code generated, as above. This definition seems only to be used by the syscall tracing code in kernel/trace/trace_syscalls.c. With this patch I can successfully use the syscall tracepoints: bash-3815 [002] .... 333.239082: sys_write -> 0x2 bash-3815 [002] .... 333.239087: sys_dup2(oldfd: a, newfd: 1) bash-3815 [002] .... 333.239088: sys_dup2 -> 0x1 bash-3815 [002] .... 333.239092: sys_fcntl(fd: a, cmd: 1, arg: 0) bash-3815 [002] .... 333.239093: sys_fcntl -> 0x1 bash-3815 [002] .... 333.239094: sys_close(fd: a) bash-3815 [002] .... 333.239094: sys_close -> 0x0 Signed-off-by: Romeo Cane Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/syscall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h index b54b2add07be..528ba9d8eed5 100644 --- a/arch/powerpc/include/asm/syscall.h +++ b/arch/powerpc/include/asm/syscall.h @@ -17,7 +17,7 @@ /* ftrace syscalls requires exporting the sys_call_table */ #ifdef CONFIG_FTRACE_SYSCALLS -extern const unsigned long *sys_call_table; +extern const unsigned long sys_call_table[]; #endif /* CONFIG_FTRACE_SYSCALLS */ static inline long syscall_get_nr(struct task_struct *task, -- cgit v1.2.3 From c675c7db629bd1abf4a9a36c0200686716eeea05 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Tue, 7 Oct 2014 15:55:33 +0530 Subject: powerpc/book3s: Don't clear MSR_RI in hmi handler. In HMI interrupt handler we don't touch SRR0/SRR1, instead we touch HSRR0/HSRR1. Hence we don't need to clear MSR_RI bit. Signed-off-by: Mahesh Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 050f79a4a168..72e783ea0681 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1270,11 +1270,6 @@ hmi_exception_early: addi r3,r1,STACK_FRAME_OVERHEAD bl hmi_exception_realmode /* Windup the stack. */ - /* Clear MSR_RI before setting SRR0 and SRR1. */ - li r0,MSR_RI - mfmsr r9 /* get MSR value */ - andc r9,r9,r0 - mtmsrd r9,1 /* Clear MSR_RI */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) mtspr SPRN_HSRR1,r9 -- cgit v1.2.3 From 4384b8fe162d8aa03905d02073707bcf364cc7ce Mon Sep 17 00:00:00 2001 From: Lan Tianyu Date: Wed, 3 Sep 2014 15:13:30 +0800 Subject: Thermal: introduce int3403 thermal driver ACPI INT3403 device object can be used to retrieve temperature date from temperature sensors present in the system, and to expose device' performance control. The previous INT3403 thermal driver supports temperature reporting only, thus remove it and introduce this new & enhanced one. Signed-off-by: Lan Tianyu Signed-off-by: Zhang Rui --- drivers/thermal/Kconfig | 15 - drivers/thermal/Makefile | 1 - drivers/thermal/int3403_thermal.c | 296 -------------- drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/int3403_thermal.c | 477 ++++++++++++++++++++++ 5 files changed, 478 insertions(+), 312 deletions(-) delete mode 100644 drivers/thermal/int3403_thermal.c create mode 100644 drivers/thermal/int340x_thermal/int3403_thermal.c diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index b34c5f54fc83..6f93e5c4e1f2 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -207,21 +207,6 @@ config X86_PKG_TEMP_THERMAL two trip points which can be set by user to get notifications via thermal notification methods. -config ACPI_INT3403_THERMAL - tristate "ACPI INT3403 thermal driver" - depends on X86 && ACPI - help - Newer laptops and tablets that use ACPI may have thermal sensors - outside the core CPU/SOC for thermal safety reasons. These - temperature sensors are also exposed for the OS to use via the so - called INT3403 ACPI object. This driver will, on devices that have - such sensors, expose the temperature information from these sensors - to userspace via the normal thermal framework. This means that a wide - range of applications and GUI widgets can show this information to - the user or use this information for making decisions. For example, - the Intel Thermal Daemon can use this information to allow the user - to select his laptop to run without turning on the fans. - config INTEL_SOC_DTS_THERMAL tristate "Intel SoCs DTS thermal driver" depends on X86 && IOSF_MBI diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 216503eaa232..39b85b5b45fc 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -31,6 +31,5 @@ obj-$(CONFIG_INTEL_POWERCLAMP) += intel_powerclamp.o obj-$(CONFIG_X86_PKG_TEMP_THERMAL) += x86_pkg_temp_thermal.o obj-$(CONFIG_INTEL_SOC_DTS_THERMAL) += intel_soc_dts_thermal.o obj-$(CONFIG_TI_SOC_THERMAL) += ti-soc-thermal/ -obj-$(CONFIG_ACPI_INT3403_THERMAL) += int3403_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int340x_thermal/ obj-$(CONFIG_ST_THERMAL) += st/ diff --git a/drivers/thermal/int3403_thermal.c b/drivers/thermal/int3403_thermal.c deleted file mode 100644 index 17554eeb3953..000000000000 --- a/drivers/thermal/int3403_thermal.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * ACPI INT3403 thermal driver - * Copyright (c) 2013, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include -#include -#include -#include -#include -#include - -#define INT3403_TYPE_SENSOR 0x03 -#define INT3403_PERF_CHANGED_EVENT 0x80 -#define INT3403_THERMAL_EVENT 0x90 - -#define DECI_KELVIN_TO_MILLI_CELSIUS(t, off) (((t) - (off)) * 100) -#define KELVIN_OFFSET 2732 -#define MILLI_CELSIUS_TO_DECI_KELVIN(t, off) (((t) / 100) + (off)) - -#define ACPI_INT3403_CLASS "int3403" -#define ACPI_INT3403_FILE_STATE "state" - -struct int3403_sensor { - struct thermal_zone_device *tzone; - unsigned long *thresholds; - unsigned long crit_temp; - int crit_trip_id; - unsigned long psv_temp; - int psv_trip_id; -}; - -static int sys_get_curr_temp(struct thermal_zone_device *tzone, - unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - unsigned long long tmp; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_TMP", NULL, &tmp); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(tmp, KELVIN_OFFSET); - - return 0; -} - -static int sys_get_trip_hyst(struct thermal_zone_device *tzone, - int trip, unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - unsigned long long hyst; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "GTSH", NULL, &hyst); - if (ACPI_FAILURE(status)) - return -EIO; - - /* - * Thermal hysteresis represents a temperature difference. - * Kelvin and Celsius have same degree size. So the - * conversion here between tenths of degree Kelvin unit - * and Milli-Celsius unit is just to multiply 100. - */ - *temp = hyst * 100; - - return 0; -} - -static int sys_get_trip_temp(struct thermal_zone_device *tzone, - int trip, unsigned long *temp) -{ - struct acpi_device *device = tzone->devdata; - struct int3403_sensor *obj = acpi_driver_data(device); - - if (trip == obj->crit_trip_id) - *temp = obj->crit_temp; - else if (trip == obj->psv_trip_id) - *temp = obj->psv_temp; - else { - /* - * get_trip_temp is a mandatory callback but - * PATx method doesn't return any value, so return - * cached value, which was last set from user space. - */ - *temp = obj->thresholds[trip]; - } - - return 0; -} - -static int sys_get_trip_type(struct thermal_zone_device *thermal, - int trip, enum thermal_trip_type *type) -{ - struct acpi_device *device = thermal->devdata; - struct int3403_sensor *obj = acpi_driver_data(device); - - /* Mandatory callback, may not mean much here */ - if (trip == obj->crit_trip_id) - *type = THERMAL_TRIP_CRITICAL; - else - *type = THERMAL_TRIP_PASSIVE; - - return 0; -} - -int sys_set_trip_temp(struct thermal_zone_device *tzone, int trip, - unsigned long temp) -{ - struct acpi_device *device = tzone->devdata; - acpi_status status; - char name[10]; - int ret = 0; - struct int3403_sensor *obj = acpi_driver_data(device); - - snprintf(name, sizeof(name), "PAT%d", trip); - if (acpi_has_method(device->handle, name)) { - status = acpi_execute_simple_method(device->handle, name, - MILLI_CELSIUS_TO_DECI_KELVIN(temp, - KELVIN_OFFSET)); - if (ACPI_FAILURE(status)) - ret = -EIO; - else - obj->thresholds[trip] = temp; - } else { - ret = -EIO; - dev_err(&device->dev, "sys_set_trip_temp: method not found\n"); - } - - return ret; -} - -static struct thermal_zone_device_ops tzone_ops = { - .get_temp = sys_get_curr_temp, - .get_trip_temp = sys_get_trip_temp, - .get_trip_type = sys_get_trip_type, - .set_trip_temp = sys_set_trip_temp, - .get_trip_hyst = sys_get_trip_hyst, -}; - -static void acpi_thermal_notify(struct acpi_device *device, u32 event) -{ - struct int3403_sensor *obj; - - if (!device) - return; - - obj = acpi_driver_data(device); - if (!obj) - return; - - switch (event) { - case INT3403_PERF_CHANGED_EVENT: - break; - case INT3403_THERMAL_EVENT: - thermal_zone_device_update(obj->tzone); - break; - default: - dev_err(&device->dev, "Unsupported event [0x%x]\n", event); - break; - } -} - -static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp) -{ - unsigned long long crt; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET); - - return 0; -} - -static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp) -{ - unsigned long long psv; - acpi_status status; - - status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv); - if (ACPI_FAILURE(status)) - return -EIO; - - *temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET); - - return 0; -} - -static int acpi_int3403_add(struct acpi_device *device) -{ - int result = 0; - unsigned long long ptyp; - acpi_status status; - struct int3403_sensor *obj; - unsigned long long trip_cnt; - int trip_mask = 0; - - if (!device) - return -EINVAL; - - status = acpi_evaluate_integer(device->handle, "PTYP", NULL, &ptyp); - if (ACPI_FAILURE(status)) - return -EINVAL; - - if (ptyp != INT3403_TYPE_SENSOR) - return -EINVAL; - - obj = devm_kzalloc(&device->dev, sizeof(*obj), GFP_KERNEL); - if (!obj) - return -ENOMEM; - - device->driver_data = obj; - - status = acpi_evaluate_integer(device->handle, "PATC", NULL, - &trip_cnt); - if (ACPI_FAILURE(status)) - trip_cnt = 0; - - if (trip_cnt) { - /* We have to cache, thresholds can't be readback */ - obj->thresholds = devm_kzalloc(&device->dev, - sizeof(*obj->thresholds) * trip_cnt, - GFP_KERNEL); - if (!obj->thresholds) - return -ENOMEM; - trip_mask = BIT(trip_cnt) - 1; - } - - obj->psv_trip_id = -1; - if (!sys_get_trip_psv(device, &obj->psv_temp)) - obj->psv_trip_id = trip_cnt++; - - obj->crit_trip_id = -1; - if (!sys_get_trip_crt(device, &obj->crit_temp)) - obj->crit_trip_id = trip_cnt++; - - obj->tzone = thermal_zone_device_register(acpi_device_bid(device), - trip_cnt, trip_mask, device, &tzone_ops, - NULL, 0, 0); - if (IS_ERR(obj->tzone)) { - result = PTR_ERR(obj->tzone); - return result; - } - - strcpy(acpi_device_name(device), "INT3403"); - strcpy(acpi_device_class(device), ACPI_INT3403_CLASS); - - return 0; -} - -static int acpi_int3403_remove(struct acpi_device *device) -{ - struct int3403_sensor *obj; - - obj = acpi_driver_data(device); - thermal_zone_device_unregister(obj->tzone); - - return 0; -} - -ACPI_MODULE_NAME("int3403"); -static const struct acpi_device_id int3403_device_ids[] = { - {"INT3403", 0}, - {"", 0}, -}; -MODULE_DEVICE_TABLE(acpi, int3403_device_ids); - -static struct acpi_driver acpi_int3403_driver = { - .name = "INT3403", - .class = ACPI_INT3403_CLASS, - .ids = int3403_device_ids, - .ops = { - .add = acpi_int3403_add, - .remove = acpi_int3403_remove, - .notify = acpi_thermal_notify, - }, -}; - -module_acpi_driver(acpi_int3403_driver); - -MODULE_AUTHOR("Srinivas Pandruvada "); -MODULE_LICENSE("GPL v2"); -MODULE_DESCRIPTION("ACPI INT3403 thermal driver"); diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile index 67c98fd24200..c4a5c50d7ee4 100644 --- a/drivers/thermal/int340x_thermal/Makefile +++ b/drivers/thermal/int340x_thermal/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o +obj-$(CONFIG_INT340X_THERMAL) += int3403_thermal.o diff --git a/drivers/thermal/int340x_thermal/int3403_thermal.c b/drivers/thermal/int340x_thermal/int3403_thermal.c new file mode 100644 index 000000000000..d20dba986f0f --- /dev/null +++ b/drivers/thermal/int340x_thermal/int3403_thermal.c @@ -0,0 +1,477 @@ +/* + * ACPI INT3403 thermal driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define INT3403_TYPE_SENSOR 0x03 +#define INT3403_TYPE_CHARGER 0x0B +#define INT3403_TYPE_BATTERY 0x0C +#define INT3403_PERF_CHANGED_EVENT 0x80 +#define INT3403_THERMAL_EVENT 0x90 + +#define DECI_KELVIN_TO_MILLI_CELSIUS(t, off) (((t) - (off)) * 100) +#define KELVIN_OFFSET 2732 +#define MILLI_CELSIUS_TO_DECI_KELVIN(t, off) (((t) / 100) + (off)) + +struct int3403_sensor { + struct thermal_zone_device *tzone; + unsigned long *thresholds; + unsigned long crit_temp; + int crit_trip_id; + unsigned long psv_temp; + int psv_trip_id; + +}; + +struct int3403_performance_state { + u64 performance; + u64 power; + u64 latency; + u64 linear; + u64 control; + u64 raw_performace; + char *raw_unit; + int reserved; +}; + +struct int3403_cdev { + struct thermal_cooling_device *cdev; + unsigned long max_state; +}; + +struct int3403_priv { + struct platform_device *pdev; + struct acpi_device *adev; + unsigned long long type; + void *priv; +}; + +static int sys_get_curr_temp(struct thermal_zone_device *tzone, + unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + unsigned long long tmp; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_TMP", NULL, &tmp); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(tmp, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_hyst(struct thermal_zone_device *tzone, + int trip, unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + unsigned long long hyst; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "GTSH", NULL, &hyst); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(hyst, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_temp(struct thermal_zone_device *tzone, + int trip, unsigned long *temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct int3403_sensor *obj = priv->priv; + + if (priv->type != INT3403_TYPE_SENSOR || !obj) + return -EINVAL; + + if (trip == obj->crit_trip_id) + *temp = obj->crit_temp; + else if (trip == obj->psv_trip_id) + *temp = obj->psv_temp; + else { + /* + * get_trip_temp is a mandatory callback but + * PATx method doesn't return any value, so return + * cached value, which was last set from user space + */ + *temp = obj->thresholds[trip]; + } + + return 0; +} + +static int sys_get_trip_type(struct thermal_zone_device *thermal, + int trip, enum thermal_trip_type *type) +{ + struct int3403_priv *priv = thermal->devdata; + struct int3403_sensor *obj = priv->priv; + + /* Mandatory callback, may not mean much here */ + if (trip == obj->crit_trip_id) + *type = THERMAL_TRIP_CRITICAL; + else + *type = THERMAL_TRIP_PASSIVE; + + return 0; +} + +int sys_set_trip_temp(struct thermal_zone_device *tzone, int trip, + unsigned long temp) +{ + struct int3403_priv *priv = tzone->devdata; + struct acpi_device *device = priv->adev; + struct int3403_sensor *obj = priv->priv; + acpi_status status; + char name[10]; + int ret = 0; + + snprintf(name, sizeof(name), "PAT%d", trip); + if (acpi_has_method(device->handle, name)) { + status = acpi_execute_simple_method(device->handle, name, + MILLI_CELSIUS_TO_DECI_KELVIN(temp, + KELVIN_OFFSET)); + if (ACPI_FAILURE(status)) + ret = -EIO; + else + obj->thresholds[trip] = temp; + } else { + ret = -EIO; + dev_err(&device->dev, "sys_set_trip_temp: method not found\n"); + } + + return ret; +} + +static struct thermal_zone_device_ops tzone_ops = { + .get_temp = sys_get_curr_temp, + .get_trip_temp = sys_get_trip_temp, + .get_trip_type = sys_get_trip_type, + .set_trip_temp = sys_set_trip_temp, + .get_trip_hyst = sys_get_trip_hyst, +}; + +static struct thermal_zone_params int3403_thermal_params = { + .governor_name = "user_space", + .no_hwmon = true, +}; + +static void int3403_notify(acpi_handle handle, + u32 event, void *data) +{ + struct int3403_priv *priv = data; + struct int3403_sensor *obj; + + if (!priv) + return; + + obj = priv->priv; + if (priv->type != INT3403_TYPE_SENSOR || !obj) + return; + + switch (event) { + case INT3403_PERF_CHANGED_EVENT: + break; + case INT3403_THERMAL_EVENT: + thermal_zone_device_update(obj->tzone); + break; + default: + dev_err(&priv->pdev->dev, "Unsupported event [0x%x]\n", event); + break; + } +} + +static int sys_get_trip_crt(struct acpi_device *device, unsigned long *temp) +{ + unsigned long long crt; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_CRT", NULL, &crt); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(crt, KELVIN_OFFSET); + + return 0; +} + +static int sys_get_trip_psv(struct acpi_device *device, unsigned long *temp) +{ + unsigned long long psv; + acpi_status status; + + status = acpi_evaluate_integer(device->handle, "_PSV", NULL, &psv); + if (ACPI_FAILURE(status)) + return -EIO; + + *temp = DECI_KELVIN_TO_MILLI_CELSIUS(psv, KELVIN_OFFSET); + + return 0; +} + +static int int3403_sensor_add(struct int3403_priv *priv) +{ + int result = 0; + acpi_status status; + struct int3403_sensor *obj; + unsigned long long trip_cnt; + int trip_mask = 0; + + obj = devm_kzalloc(&priv->pdev->dev, sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + priv->priv = obj; + + status = acpi_evaluate_integer(priv->adev->handle, "PATC", NULL, + &trip_cnt); + if (ACPI_FAILURE(status)) + trip_cnt = 0; + + if (trip_cnt) { + /* We have to cache, thresholds can't be readback */ + obj->thresholds = devm_kzalloc(&priv->pdev->dev, + sizeof(*obj->thresholds) * trip_cnt, + GFP_KERNEL); + if (!obj->thresholds) { + result = -ENOMEM; + goto err_free_obj; + } + trip_mask = BIT(trip_cnt) - 1; + } + + obj->psv_trip_id = -1; + if (!sys_get_trip_psv(priv->adev, &obj->psv_temp)) + obj->psv_trip_id = trip_cnt++; + + obj->crit_trip_id = -1; + if (!sys_get_trip_crt(priv->adev, &obj->crit_temp)) + obj->crit_trip_id = trip_cnt++; + + obj->tzone = thermal_zone_device_register(acpi_device_bid(priv->adev), + trip_cnt, trip_mask, priv, &tzone_ops, + &int3403_thermal_params, 0, 0); + if (IS_ERR(obj->tzone)) { + result = PTR_ERR(obj->tzone); + obj->tzone = NULL; + goto err_free_obj; + } + + result = acpi_install_notify_handler(priv->adev->handle, + ACPI_DEVICE_NOTIFY, int3403_notify, + (void *)priv); + if (result) + goto err_free_obj; + + return 0; + + err_free_obj: + if (obj->tzone) + thermal_zone_device_unregister(obj->tzone); + return result; +} + +static int int3403_sensor_remove(struct int3403_priv *priv) +{ + struct int3403_sensor *obj = priv->priv; + + thermal_zone_device_unregister(obj->tzone); + return 0; +} + +/* INT3403 Cooling devices */ +static int int3403_get_max_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + struct int3403_priv *priv = cdev->devdata; + struct int3403_cdev *obj = priv->priv; + + *state = obj->max_state; + return 0; +} + +static int int3403_get_cur_state(struct thermal_cooling_device *cdev, + unsigned long *state) +{ + struct int3403_priv *priv = cdev->devdata; + unsigned long long level; + acpi_status status; + + status = acpi_evaluate_integer(priv->adev->handle, "PPPC", NULL, &level); + if (ACPI_SUCCESS(status)) { + *state = level; + return 0; + } else + return -EINVAL; +} + +static int +int3403_set_cur_state(struct thermal_cooling_device *cdev, unsigned long state) +{ + struct int3403_priv *priv = cdev->devdata; + acpi_status status; + + status = acpi_execute_simple_method(priv->adev->handle, "SPPC", state); + if (ACPI_SUCCESS(status)) + return 0; + else + return -EINVAL; +} + +static const struct thermal_cooling_device_ops int3403_cooling_ops = { + .get_max_state = int3403_get_max_state, + .get_cur_state = int3403_get_cur_state, + .set_cur_state = int3403_set_cur_state, +}; + +static int int3403_cdev_add(struct int3403_priv *priv) +{ + int result = 0; + acpi_status status; + struct int3403_cdev *obj; + struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL }; + union acpi_object *p; + + obj = devm_kzalloc(&priv->pdev->dev, sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + status = acpi_evaluate_object(priv->adev->handle, "PPSS", NULL, &buf); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buf.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + printk(KERN_WARNING "Invalid PPSS data\n"); + return -EFAULT; + } + + obj->max_state = p->package.count - 1; + obj->cdev = + thermal_cooling_device_register(acpi_device_bid(priv->adev), + priv, &int3403_cooling_ops); + if (IS_ERR(obj->cdev)) + result = PTR_ERR(obj->cdev); + + priv->priv = obj; + + /* TODO: add ACPI notification support */ + + return result; +} + +static int int3403_cdev_remove(struct int3403_priv *priv) +{ + struct int3403_cdev *obj = priv->priv; + + thermal_cooling_device_unregister(obj->cdev); + return 0; +} + +static int int3403_add(struct platform_device *pdev) +{ + struct int3403_priv *priv; + int result = 0; + acpi_status status; + + priv = devm_kzalloc(&pdev->dev, sizeof(struct int3403_priv), + GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->pdev = pdev; + priv->adev = ACPI_COMPANION(&(pdev->dev)); + if (!priv->adev) { + result = -EINVAL; + goto err; + } + + status = acpi_evaluate_integer(priv->adev->handle, "PTYP", + NULL, &priv->type); + if (ACPI_FAILURE(status)) { + result = -EINVAL; + goto err; + } + + platform_set_drvdata(pdev, priv); + switch (priv->type) { + case INT3403_TYPE_SENSOR: + result = int3403_sensor_add(priv); + break; + case INT3403_TYPE_CHARGER: + case INT3403_TYPE_BATTERY: + result = int3403_cdev_add(priv); + break; + default: + result = -EINVAL; + } + + if (result) + goto err; + return result; + +err: + return result; +} + +static int int3403_remove(struct platform_device *pdev) +{ + struct int3403_priv *priv = platform_get_drvdata(pdev); + + switch (priv->type) { + case INT3403_TYPE_SENSOR: + int3403_sensor_remove(priv); + break; + case INT3403_TYPE_CHARGER: + case INT3403_TYPE_BATTERY: + int3403_cdev_remove(priv); + break; + default: + break; + } + + return 0; +} + +static const struct acpi_device_id int3403_device_ids[] = { + {"INT3403", 0}, + {"", 0}, +}; +MODULE_DEVICE_TABLE(acpi, int3403_device_ids); + +static struct platform_driver int3403_driver = { + .probe = int3403_add, + .remove = int3403_remove, + .driver = { + .name = "int3403 thermal", + .owner = THIS_MODULE, + .acpi_match_table = int3403_device_ids, + }, +}; + +module_platform_driver(int3403_driver); + +MODULE_AUTHOR("Srinivas Pandruvada "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("ACPI INT3403 thermal driver"); -- cgit v1.2.3 From 52b1c69d7e3cd8bdba0e55bde24093f0779bb29d Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Wed, 3 Sep 2014 15:14:23 +0800 Subject: Thermal: int340x_thermal: expose acpi thermal relationship tables ACPI 4.0 introduced two thermal relationship tables via _ART (active cooling) and _TRT (passive cooling) objects. These tables contain many to many relationships among thermal sensors and cooling devices. This patch parses _ART and _TRT and makes the result available to the userspace via an misc device interface. At the same time, kernel drivers can also request parsing results from internal kernel APIs. The results include source and target devices, influence, and sampling rate in case of _TRT. For _ART, the result shows source device, target device, and weight percentage. Signed-off-by: Jacob Pan Signed-off-by: Zhang Rui --- drivers/thermal/Kconfig | 5 + drivers/thermal/int340x_thermal/Makefile | 1 + drivers/thermal/int340x_thermal/acpi_thermal_rel.c | 400 +++++++++++++++++++++ drivers/thermal/int340x_thermal/acpi_thermal_rel.h | 84 +++++ 4 files changed, 490 insertions(+) create mode 100644 drivers/thermal/int340x_thermal/acpi_thermal_rel.c create mode 100644 drivers/thermal/int340x_thermal/acpi_thermal_rel.h diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 6f93e5c4e1f2..3a8929222cab 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -223,6 +223,7 @@ config INT340X_THERMAL tristate "ACPI INT340X thermal drivers" depends on X86 && ACPI select THERMAL_GOV_USER_SPACE + select ACPI_THERMAL_REL help Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core @@ -237,6 +238,10 @@ config INT340X_THERMAL information to allow the user to select his laptop to run without turning on the fans. +config ACPI_THERMAL_REL + tristate + depends on ACPI + menu "Texas Instruments thermal drivers" source "drivers/thermal/ti-soc-thermal/Kconfig" endmenu diff --git a/drivers/thermal/int340x_thermal/Makefile b/drivers/thermal/int340x_thermal/Makefile index c4a5c50d7ee4..ffe40bffaf1a 100644 --- a/drivers/thermal/int340x_thermal/Makefile +++ b/drivers/thermal/int340x_thermal/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_INT340X_THERMAL) += int3400_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int3402_thermal.o obj-$(CONFIG_INT340X_THERMAL) += int3403_thermal.o +obj-$(CONFIG_ACPI_THERMAL_REL) += acpi_thermal_rel.o diff --git a/drivers/thermal/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c new file mode 100644 index 000000000000..0d8db808f0ae --- /dev/null +++ b/drivers/thermal/int340x_thermal/acpi_thermal_rel.c @@ -0,0 +1,400 @@ +/* acpi_thermal_rel.c driver for exporting ACPI thermal relationship + * + * Copyright (c) 2014 Intel Corp + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + */ + +/* + * Two functionalities included: + * 1. Export _TRT, _ART, via misc device interface to the userspace. + * 2. Provide parsing result to kernel drivers + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acpi_thermal_rel.h" + +static acpi_handle acpi_thermal_rel_handle; +static DEFINE_SPINLOCK(acpi_thermal_rel_chrdev_lock); +static int acpi_thermal_rel_chrdev_count; /* #times opened */ +static int acpi_thermal_rel_chrdev_exclu; /* already open exclusive? */ + +static int acpi_thermal_rel_open(struct inode *inode, struct file *file) +{ + spin_lock(&acpi_thermal_rel_chrdev_lock); + if (acpi_thermal_rel_chrdev_exclu || + (acpi_thermal_rel_chrdev_count && (file->f_flags & O_EXCL))) { + spin_unlock(&acpi_thermal_rel_chrdev_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + acpi_thermal_rel_chrdev_exclu = 1; + acpi_thermal_rel_chrdev_count++; + + spin_unlock(&acpi_thermal_rel_chrdev_lock); + + return nonseekable_open(inode, file); +} + +static int acpi_thermal_rel_release(struct inode *inode, struct file *file) +{ + spin_lock(&acpi_thermal_rel_chrdev_lock); + acpi_thermal_rel_chrdev_count--; + acpi_thermal_rel_chrdev_exclu = 0; + spin_unlock(&acpi_thermal_rel_chrdev_lock); + + return 0; +} + +/** + * acpi_parse_trt - Thermal Relationship Table _TRT for passive cooling + * + * @handle: ACPI handle of the device contains _TRT + * @art_count: the number of valid entries resulted from parsing _TRT + * @artp: pointer to pointer of array of art entries in parsing result + * @create_dev: whether to create platform devices for target and source + * + */ +int acpi_parse_trt(acpi_handle handle, int *trt_count, struct trt **trtp, + bool create_dev) +{ + acpi_status status; + int result = 0; + int i; + int nr_bad_entries = 0; + struct trt *trts; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" }; + + if (!acpi_has_method(handle, "_TRT")) + return 0; + + status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _TRT data\n"); + result = -EFAULT; + goto end; + } + + *trt_count = p->package.count; + trts = kzalloc(*trt_count * sizeof(struct trt), GFP_KERNEL); + if (!trts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < *trt_count; i++) { + struct trt *trt = &trts[i - nr_bad_entries]; + + element.length = sizeof(struct trt); + element.pointer = trt; + + status = acpi_extract_package(&(p->package.elements[i]), + &trt_format, &element); + if (ACPI_FAILURE(status)) { + nr_bad_entries++; + pr_warn("_TRT package %d is invalid, ignored\n", i); + continue; + } + if (!create_dev) + continue; + + result = acpi_bus_get_device(trt->source, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + + result = acpi_bus_get_device(trt->target, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get target ACPI device\n"); + } + + *trtp = trts; + /* don't count bad entries */ + *trt_count -= nr_bad_entries; +end: + kfree(buffer.pointer); + return result; +} +EXPORT_SYMBOL(acpi_parse_trt); + +/** + * acpi_parse_art - Parse Active Relationship Table _ART + * + * @handle: ACPI handle of the device contains _ART + * @art_count: the number of valid entries resulted from parsing _ART + * @artp: pointer to pointer of array of art entries in parsing result + * @create_dev: whether to create platform devices for target and source + * + */ +int acpi_parse_art(acpi_handle handle, int *art_count, struct art **artp, + bool create_dev) +{ + acpi_status status; + int result = 0; + int i; + int nr_bad_entries = 0; + struct art *arts; + struct acpi_device *adev; + union acpi_object *p; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer element = { 0, NULL }; + struct acpi_buffer art_format = { + sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" }; + + if (!acpi_has_method(handle, "_ART")) + return 0; + + status = acpi_evaluate_object(handle, "_ART", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + p = buffer.pointer; + if (!p || (p->type != ACPI_TYPE_PACKAGE)) { + pr_err("Invalid _ART data\n"); + result = -EFAULT; + goto end; + } + + /* ignore p->package.elements[0], as this is _ART Revision field */ + *art_count = p->package.count - 1; + arts = kzalloc(*art_count * sizeof(struct art), GFP_KERNEL); + if (!arts) { + result = -ENOMEM; + goto end; + } + + for (i = 0; i < *art_count; i++) { + struct art *art = &arts[i - nr_bad_entries]; + + element.length = sizeof(struct art); + element.pointer = art; + + status = acpi_extract_package(&(p->package.elements[i + 1]), + &art_format, &element); + if (ACPI_FAILURE(status)) { + pr_warn("_ART package %d is invalid, ignored", i); + nr_bad_entries++; + continue; + } + if (!create_dev) + continue; + + if (art->source) { + result = acpi_bus_get_device(art->source, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + } + if (art->target) { + result = acpi_bus_get_device(art->target, &adev); + if (!result) + acpi_create_platform_device(adev); + else + pr_warn("Failed to get source ACPI device\n"); + } + } + + *artp = arts; + /* don't count bad entries */ + *art_count -= nr_bad_entries; +end: + kfree(buffer.pointer); + return result; +} +EXPORT_SYMBOL(acpi_parse_art); + + +/* get device name from acpi handle */ +static void get_single_name(acpi_handle handle, char *name) +{ + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER}; + + if (ACPI_FAILURE(acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer))) + pr_warn("Failed get name from handle\n"); + else { + memcpy(name, buffer.pointer, ACPI_NAME_SIZE); + kfree(buffer.pointer); + } +} + +static int fill_art(char __user *ubuf) +{ + int i; + int ret; + int count; + int art_len; + struct art *arts = NULL; + union art_object *art_user; + + ret = acpi_parse_art(acpi_thermal_rel_handle, &count, &arts, false); + if (ret) + goto free_art; + art_len = count * sizeof(union art_object); + art_user = kzalloc(art_len, GFP_KERNEL); + if (!art_user) { + ret = -ENOMEM; + goto free_art; + } + /* now fill in user art data */ + for (i = 0; i < count; i++) { + /* userspace art needs device name instead of acpi reference */ + get_single_name(arts[i].source, art_user[i].source_device); + get_single_name(arts[i].target, art_user[i].target_device); + /* copy the rest int data in addition to source and target */ + memcpy(&art_user[i].weight, &arts[i].weight, + sizeof(u64) * (ACPI_NR_ART_ELEMENTS - 2)); + } + + if (copy_to_user(ubuf, art_user, art_len)) + ret = -EFAULT; + kfree(art_user); +free_art: + kfree(arts); + return ret; +} + +static int fill_trt(char __user *ubuf) +{ + int i; + int ret; + int count; + int trt_len; + struct trt *trts = NULL; + union trt_object *trt_user; + + ret = acpi_parse_trt(acpi_thermal_rel_handle, &count, &trts, false); + if (ret) + goto free_trt; + trt_len = count * sizeof(union trt_object); + trt_user = kzalloc(trt_len, GFP_KERNEL); + if (!trt_user) { + ret = -ENOMEM; + goto free_trt; + } + /* now fill in user trt data */ + for (i = 0; i < count; i++) { + /* userspace trt needs device name instead of acpi reference */ + get_single_name(trts[i].source, trt_user[i].source_device); + get_single_name(trts[i].target, trt_user[i].target_device); + trt_user[i].sample_period = trts[i].sample_period; + trt_user[i].influence = trts[i].influence; + } + + if (copy_to_user(ubuf, trt_user, trt_len)) + ret = -EFAULT; + kfree(trt_user); +free_trt: + kfree(trts); + return ret; +} + +static long acpi_thermal_rel_ioctl(struct file *f, unsigned int cmd, + unsigned long __arg) +{ + int ret = 0; + unsigned long length = 0; + unsigned long count = 0; + char __user *arg = (void __user *)__arg; + struct trt *trts; + struct art *arts; + + switch (cmd) { + case ACPI_THERMAL_GET_TRT_COUNT: + ret = acpi_parse_trt(acpi_thermal_rel_handle, (int *)&count, + &trts, false); + kfree(trts); + if (!ret) + return put_user(count, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_TRT_LEN: + ret = acpi_parse_trt(acpi_thermal_rel_handle, (int *)&count, + &trts, false); + kfree(trts); + length = count * sizeof(union trt_object); + if (!ret) + return put_user(length, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_TRT: + return fill_trt(arg); + case ACPI_THERMAL_GET_ART_COUNT: + ret = acpi_parse_art(acpi_thermal_rel_handle, (int *)&count, + &arts, false); + kfree(arts); + if (!ret) + return put_user(count, (unsigned long __user *)__arg); + return ret; + case ACPI_THERMAL_GET_ART_LEN: + ret = acpi_parse_art(acpi_thermal_rel_handle, (int *)&count, + &arts, false); + kfree(arts); + length = count * sizeof(union art_object); + if (!ret) + return put_user(length, (unsigned long __user *)__arg); + return ret; + + case ACPI_THERMAL_GET_ART: + return fill_art(arg); + + default: + return -ENOTTY; + } +} + +static const struct file_operations acpi_thermal_rel_fops = { + .owner = THIS_MODULE, + .open = acpi_thermal_rel_open, + .release = acpi_thermal_rel_release, + .unlocked_ioctl = acpi_thermal_rel_ioctl, + .llseek = no_llseek, +}; + +static struct miscdevice acpi_thermal_rel_misc_device = { + .minor = MISC_DYNAMIC_MINOR, + "acpi_thermal_rel", + &acpi_thermal_rel_fops +}; + +int acpi_thermal_rel_misc_device_add(acpi_handle handle) +{ + acpi_thermal_rel_handle = handle; + + return misc_register(&acpi_thermal_rel_misc_device); +} +EXPORT_SYMBOL(acpi_thermal_rel_misc_device_add); + +int acpi_thermal_rel_misc_device_remove(acpi_handle handle) +{ + misc_deregister(&acpi_thermal_rel_misc_device); + + return 0; +} +EXPORT_SYMBOL(acpi_thermal_rel_misc_device_remove); + +MODULE_AUTHOR("Zhang Rui "); +MODULE_AUTHOR("Jacob Pan + +#define ACPI_THERMAL_MAGIC 's' + +#define ACPI_THERMAL_GET_TRT_LEN _IOR(ACPI_THERMAL_MAGIC, 1, unsigned long) +#define ACPI_THERMAL_GET_ART_LEN _IOR(ACPI_THERMAL_MAGIC, 2, unsigned long) +#define ACPI_THERMAL_GET_TRT_COUNT _IOR(ACPI_THERMAL_MAGIC, 3, unsigned long) +#define ACPI_THERMAL_GET_ART_COUNT _IOR(ACPI_THERMAL_MAGIC, 4, unsigned long) + +#define ACPI_THERMAL_GET_TRT _IOR(ACPI_THERMAL_MAGIC, 5, unsigned long) +#define ACPI_THERMAL_GET_ART _IOR(ACPI_THERMAL_MAGIC, 6, unsigned long) + +struct art { + acpi_handle source; + acpi_handle target; + u64 weight; + u64 ac0_max; + u64 ac1_max; + u64 ac2_max; + u64 ac3_max; + u64 ac4_max; + u64 ac5_max; + u64 ac6_max; + u64 ac7_max; + u64 ac8_max; + u64 ac9_max; +} __packed; + +struct trt { + acpi_handle source; + acpi_handle target; + u64 influence; + u64 sample_period; + u64 reverved1; + u64 reverved2; + u64 reverved3; + u64 reverved4; +} __packed; + +#define ACPI_NR_ART_ELEMENTS 13 +/* for usrspace */ +union art_object { + struct { + char source_device[8]; /* ACPI single name */ + char target_device[8]; /* ACPI single name */ + u64 weight; + u64 ac0_max_level; + u64 ac1_max_level; + u64 ac2_max_level; + u64 ac3_max_level; + u64 ac4_max_level; + u64 ac5_max_level; + u64 ac6_max_level; + u64 ac7_max_level; + u64 ac8_max_level; + u64 ac9_max_level; + }; + u64 __data[ACPI_NR_ART_ELEMENTS]; +}; + +union trt_object { + struct { + char source_device[8]; /* ACPI single name */ + char target_device[8]; /* ACPI single name */ + u64 influence; + u64 sample_period; + u64 reserved[4]; + }; + u64 __data[8]; +}; + +#ifdef __KERNEL__ +int acpi_thermal_rel_misc_device_add(acpi_handle handle); +int acpi_thermal_rel_misc_device_remove(acpi_handle handle); +int acpi_parse_art(acpi_handle handle, int *art_count, struct art **arts, + bool create_dev); +int acpi_parse_trt(acpi_handle handle, int *trt_count, struct trt **trts, + bool create_dev); +#endif + +#endif /* __ACPI_ACPI_THERMAL_H */ -- cgit v1.2.3 From 6306e68a63f26219f3b1f948e96ba387ddccb272 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 12 Jun 2014 08:08:07 -0700 Subject: Thermal: int3400_thermal: use acpi_thermal_rel parsing APIs ACPI _TRT and _ART parsing code has been moved to acpi_thermal_rel such that it can be used by other devices in the future. Use the parsing APIs in acpi_thermal_rel.c instead. Signed-off-by: Jacob Pan Signed-off-by: Zhang Rui --- drivers/thermal/int340x_thermal/int3400_thermal.c | 172 ++-------------------- 1 file changed, 13 insertions(+), 159 deletions(-) diff --git a/drivers/thermal/int340x_thermal/int3400_thermal.c b/drivers/thermal/int340x_thermal/int3400_thermal.c index 9104b4f9381b..edc1cce117ba 100644 --- a/drivers/thermal/int340x_thermal/int3400_thermal.c +++ b/drivers/thermal/int340x_thermal/int3400_thermal.c @@ -14,33 +14,7 @@ #include #include #include - -struct art { - acpi_handle source; - acpi_handle target; - u64 weight; - u64 ac0_max; - u64 ac1_max; - u64 ac2_max; - u64 ac3_max; - u64 ac4_max; - u64 ac5_max; - u64 ac6_max; - u64 ac7_max; - u64 ac8_max; - u64 ac9_max; -}; - -struct trt { - acpi_handle source; - acpi_handle target; - u64 influence; - u64 sampling_period; - u64 reverved1; - u64 reverved2; - u64 reverved3; - u64 reverved4; -}; +#include "acpi_thermal_rel.h" enum int3400_thermal_uuid { INT3400_THERMAL_PASSIVE_1, @@ -68,6 +42,7 @@ struct int3400_thermal_priv { int trt_count; struct trt *trts; u8 uuid_bitmap; + int rel_misc_dev_res; }; static int int3400_thermal_get_uuids(struct int3400_thermal_priv *priv) @@ -146,136 +121,6 @@ static int int3400_thermal_run_osc(acpi_handle handle, return result; } - -static int parse_art(struct int3400_thermal_priv *priv) -{ - acpi_handle handle = priv->adev->handle; - acpi_status status; - int result = 0; - int i; - struct acpi_device *adev; - union acpi_object *p; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - struct acpi_buffer element = { 0, NULL }; - struct acpi_buffer art_format = { - sizeof("RRNNNNNNNNNNN"), "RRNNNNNNNNNNN" }; - - if (!acpi_has_method(handle, "_ART")) - return 0; - - status = acpi_evaluate_object(handle, "_ART", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - p = buffer.pointer; - if (!p || (p->type != ACPI_TYPE_PACKAGE)) { - pr_err("Invalid _ART data\n"); - result = -EFAULT; - goto end; - } - - /* ignore p->package.elements[0], as this is _ART Revision field */ - priv->art_count = p->package.count - 1; - priv->arts = kzalloc(sizeof(struct art) * priv->art_count, GFP_KERNEL); - if (!priv->arts) { - result = -ENOMEM; - goto end; - } - - for (i = 0; i < priv->art_count; i++) { - struct art *art = &(priv->arts[i]); - - element.length = sizeof(struct art); - element.pointer = art; - - status = acpi_extract_package(&(p->package.elements[i + 1]), - &art_format, &element); - if (ACPI_FAILURE(status)) { - pr_err("Invalid _ART data"); - result = -EFAULT; - kfree(priv->arts); - goto end; - } - result = acpi_bus_get_device(art->source, &adev); - if (!result) - acpi_create_platform_device(adev, NULL); - else - pr_warn("Failed to get source ACPI device\n"); - result = acpi_bus_get_device(art->target, &adev); - if (!result) - acpi_create_platform_device(adev, NULL); - else - pr_warn("Failed to get source ACPI device\n"); - } -end: - kfree(buffer.pointer); - return result; -} - -static int parse_trt(struct int3400_thermal_priv *priv) -{ - acpi_handle handle = priv->adev->handle; - acpi_status status; - int result = 0; - int i; - struct acpi_device *adev; - union acpi_object *p; - struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; - struct acpi_buffer element = { 0, NULL }; - struct acpi_buffer trt_format = { sizeof("RRNNNNNN"), "RRNNNNNN" }; - - if (!acpi_has_method(handle, "_TRT")) - return 0; - - status = acpi_evaluate_object(handle, "_TRT", NULL, &buffer); - if (ACPI_FAILURE(status)) - return -ENODEV; - - p = buffer.pointer; - if (!p || (p->type != ACPI_TYPE_PACKAGE)) { - pr_err("Invalid _TRT data\n"); - result = -EFAULT; - goto end; - } - - priv->trt_count = p->package.count; - priv->trts = kzalloc(sizeof(struct trt) * priv->trt_count, GFP_KERNEL); - if (!priv->trts) { - result = -ENOMEM; - goto end; - } - - for (i = 0; i < priv->trt_count; i++) { - struct trt *trt = &(priv->trts[i]); - - element.length = sizeof(struct trt); - element.pointer = trt; - - status = acpi_extract_package(&(p->package.elements[i]), - &trt_format, &element); - if (ACPI_FAILURE(status)) { - pr_err("Invalid _ART data"); - result = -EFAULT; - kfree(priv->trts); - goto end; - } - - result = acpi_bus_get_device(trt->source, &adev); - if (!result) - acpi_create_platform_device(adev, NULL); - else - pr_warn("Failed to get source ACPI device\n"); - result = acpi_bus_get_device(trt->target, &adev); - if (!result) - acpi_create_platform_device(adev, NULL); - else - pr_warn("Failed to get target ACPI device\n"); - } -end: - kfree(buffer.pointer); - return result; -} - static int int3400_thermal_get_temp(struct thermal_zone_device *thermal, unsigned long *temp) { @@ -350,11 +195,14 @@ static int int3400_thermal_probe(struct platform_device *pdev) if (result) goto free_priv; - result = parse_art(priv); + result = acpi_parse_art(priv->adev->handle, &priv->art_count, + &priv->arts, true); if (result) goto free_priv; - result = parse_trt(priv); + + result = acpi_parse_trt(priv->adev->handle, &priv->trt_count, + &priv->trts, true); if (result) goto free_art; @@ -372,6 +220,9 @@ static int int3400_thermal_probe(struct platform_device *pdev) goto free_trt; } + priv->rel_misc_dev_res = acpi_thermal_rel_misc_device_add( + priv->adev->handle); + return 0; free_trt: kfree(priv->trts); @@ -386,6 +237,9 @@ static int int3400_thermal_remove(struct platform_device *pdev) { struct int3400_thermal_priv *priv = platform_get_drvdata(pdev); + if (!priv->rel_misc_dev_res) + acpi_thermal_rel_misc_device_remove(priv->adev->handle); + thermal_zone_device_unregister(priv->thermal); kfree(priv->trts); kfree(priv->arts); -- cgit v1.2.3 From 0ff8947fc5f700172b37cbca811a38eb9cb81e08 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 11 Oct 2014 19:51:17 -0400 Subject: ext4: fix reservation overflow in ext4_da_write_begin Delalloc write journal reservations only reserve 1 credit, to update the inode if necessary. However, it may happen once in a filesystem's lifetime that a file will cross the 2G threshold, and require the LARGE_FILE feature to be set in the superblock as well, if it was not set already. This overruns the transaction reservation, and can be demonstrated simply on any ext4 filesystem without the LARGE_FILE feature already set: dd if=/dev/zero of=testfile bs=1 seek=2147483646 count=1 \ conv=notrunc of=testfile sync dd if=/dev/zero of=testfile bs=1 seek=2147483647 count=1 \ conv=notrunc of=testfile leads to: EXT4-fs: ext4_do_update_inode:4296: aborting transaction: error 28 in __ext4_handle_dirty_super EXT4-fs error (device loop0) in ext4_do_update_inode:4301: error 28 EXT4-fs error (device loop0) in ext4_reserve_inode_write:4757: Readonly filesystem EXT4-fs error (device loop0) in ext4_dirty_inode:4876: error 28 EXT4-fs error (device loop0) in ext4_da_write_end:2685: error 28 Adjust the number of credits based on whether the flag is already set, and whether the current write may extend past the LARGE_FILE limit. Signed-off-by: Eric Sandeen Signed-off-by: Theodore Ts'o Reviewed-by: Andreas Dilger Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e204d8aabe7d..0dd9150c0c04 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2495,6 +2495,20 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } +/* We always reserve for an inode update; the superblock could be there too */ +static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) +{ + if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + return 1; + + if (pos + len <= 0x7fffffffULL) + return 1; + + /* We might need to update the superblock to set LARGE_FILE */ + return 2; +} + static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2545,7 +2559,8 @@ retry_grab: * of file which has an already mapped buffer. */ retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_da_write_credits(inode, pos, len)); if (IS_ERR(handle)) { page_cache_release(page); return PTR_ERR(handle); -- cgit v1.2.3 From 65dd8327eb055a393a413a2214f70a9a10ff7ad6 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Sat, 11 Oct 2014 19:56:34 -0400 Subject: ext4: delete useless comments about ext4_move_extents In patch 'ext4: refactor ext4_move_extents code base', Dmitry Monakhov has refactored ext4_move_extents' implementation, but forgot to update the corresponding comments, this patch will try to delete some useless comments. Reviewed-by: Dmitry Monakhov Signed-off-by: Xiaoguang Wang Signed-off-by: Theodore Ts'o --- fs/ext4/move_extent.c | 59 ++++++--------------------------------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 5d7806390102..9f2311bc9c4f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -119,34 +119,13 @@ out: return ret; } -/** - * mext_replace_branches - Replace original extents with new extents - * - * @handle: journal handle - * @orig_inode: original inode - * @donor_inode: donor inode - * @from: block offset of orig_inode - * @count: block count to be replaced - * @err: pointer to save return value - * - * Replace original inode extents and donor inode extents page by page. - * We implement this replacement in the following three steps: - * 1. Save the block information of original and donor inodes into - * dummy extents. - * 2. Change the block information of original inode to point at the - * donor inode blocks. - * 3. Change the block information of donor inode to point at the saved - * original inode blocks in the dummy extents. - * - * Return replaced block count. - */ - /** * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index: page index + * @index1: page index + * @index2: page index * @page: result page vector * * Grab two locked pages for inode's by inode order @@ -266,13 +245,14 @@ out: * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file + * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling mext_replace_branches(). + * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ @@ -551,41 +531,14 @@ mext_check_arguments(struct inode *orig_inode, * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file - * @orig_start: start offset in block for orig - * @donor_start: start offset in block for donor + * @orig_blk: start offset in block for orig + * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * - * Note: ext4_move_extents() proceeds the following order. - * 1:ext4_move_extents() calculates the last block number of moving extent - * function by the start block number (orig_start) and the number of blocks - * to be moved (len) specified as arguments. - * If the {orig, donor}_start points a hole, the extent's start offset - * pointed by ext_cur (current extent), holecheck_path, orig_path are set - * after hole behind. - * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent - * or the ext_cur exceeds the block_end which is last logical block number. - * 3:To get the length of continues area, call mext_next_extent() - * specified with the ext_cur (initial value is holecheck_path) re-cursive, - * until find un-continuous extent, the start logical block number exceeds - * the block_end or the extent points to the last extent. - * 4:Exchange the original inode data with donor inode data - * from orig_page_offset to seq_end_page. - * The start indexes of data are specified as arguments. - * That of the original inode is orig_page_offset, - * and the donor inode is also orig_page_offset - * (To easily handle blocksize != pagesize case, the offset for the - * donor inode is block unit). - * 5:Update holecheck_path and orig_path to points a next proceeding extent, - * then returns to step 2. - * 6:Release holecheck_path, orig_path and set the len to moved_len - * which shows the number of moved blocks. - * The moved_len is useful for the command to calculate the file offset - * for starting next move extent ioctl. - * 7:Return 0 on success, or a negative error value on failure. */ int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, -- cgit v1.2.3 From 6507955c9781a75f1b085f0cf0a77b9df06f0197 Mon Sep 17 00:00:00 2001 From: Mahesh Salgaonkar Date: Fri, 10 Oct 2014 21:28:26 +0530 Subject: powerpc/powernv: Fallback to old HMI handling behavior for old firmware Recently we moved HMI handling into Linux kernel instead of taking HMI directly in OPAL. This new change is dependent on new OPAL call for HMI recovery which was introduced in newer firmware. While this new change works fine with latest OPAL firmware, we broke the HMI handling if we run newer kernel on old OPAL firmware that results in system hang. This patch fixes this issue by falling back to old HMI behavior on older OPAL firmware. This patch introduces a check for opal token OPAL_HANDLE_HMI to see if we are running on newer firmware or old firmware. On newer firmware this check would return OPAL_TOKEN_PRESENT, otherwise we are running on old firmware and fallback to old HMI behavior. Old firmware: POWER8 System Firmware Release as of today <= SV810_087 Action: Let OPAL handle HMIs Newer firmware: in development/yet to be released. Action: Let Linux host handle HMIs. This patch depends on opal check token patch posted at ppc-devel https://lists.ozlabs.org/pipermail/linuxppc-dev/2014-August/120224.html Signed-off-by: Mahesh Salgaonkar [mpe: Minor comment and printk rewording] Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index cf85ba86bf5e..34d96615db6c 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -194,6 +194,27 @@ static int __init opal_register_exception_handlers(void) * fwnmi area at 0x7000 to provide the glue space to OPAL */ glue = 0x7000; + + /* + * Check if we are running on newer firmware that exports + * OPAL_HANDLE_HMI token. If yes, then don't ask OPAL to patch + * the HMI interrupt and we catch it directly in Linux. + * + * For older firmware (i.e currently released POWER8 System Firmware + * as of today <= SV810_087), we fallback to old behavior and let OPAL + * patch the HMI vector and handle it inside OPAL firmware. + * + * For newer firmware (in development/yet to be released) we will + * start catching/handling HMI directly in Linux. + */ + if (!opal_check_token(OPAL_HANDLE_HMI)) { + pr_info("opal: Old firmware detected, OPAL handles HMIs.\n"); + opal_register_exception_handler( + OPAL_HYPERVISOR_MAINTENANCE_HANDLER, + 0, glue); + glue += 128; + } + opal_register_exception_handler(OPAL_SOFTPATCH_HANDLER, 0, glue); #endif -- cgit v1.2.3 From 2d15b9b479512f05680541acffd9acbbc831a47c Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Thu, 9 Oct 2014 16:41:28 -0700 Subject: powerpc/numa: check error return from proc_create proc_create can fail, we should check the return value and pass up the failure. Suggested-by: Michael Ellerman Signed-off-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 649666d5d1c2..51d707d85b2d 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1801,7 +1801,8 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { start_topology_update(); - proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) + return -ENOMEM; return 0; } -- cgit v1.2.3 From 2d73bae12b26db6eba074b70406c707961b6cda9 Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Fri, 10 Oct 2014 09:04:49 -0700 Subject: powerpc/numa: Add ability to disable and debug topology updates We have hit a few customer issues with the topology update code (VPHN and PRRN). It would be nice to be able to debug the notifications coming from the hypervisor in both cases to the LPAR, as well as to disable responding to the notifications at boot-time, to narrow down the source of the problems. Add a basic level of such functionality, similar to the numa= command-line parameter. We already have a toggle in /proc/powerpc/topology_updates that allows run-time enabling/disabling, so the updates can be started at run-time if desired. But the bugs we've run into have occured during boot or very shortly after coming to login, and have resulted in a broken NUMA topology. Signed-off-by: Nishanth Aravamudan Signed-off-by: Michael Ellerman --- Documentation/kernel-parameters.txt | 6 ++++++ arch/powerpc/mm/numa.c | 35 ++++++++++++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 10d51c2f10d7..c536e1c74320 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3370,6 +3370,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. e.g. base its process migration decisions on it. Default is on. + topology_updates= [KNL, PPC, NUMA] + Format: {off} + Specify if the kernel should ignore (off) + topology updates sent by the hypervisor to this + LPAR. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 51d707d85b2d..177659050fa0 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -8,6 +8,8 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#define pr_fmt(fmt) "numa: " fmt + #include #include #include @@ -1153,6 +1155,22 @@ static int __init early_numa(char *p) } early_param("numa", early_numa); +static bool topology_updates_enabled = true; + +static int __init early_topology_updates(char *p) +{ + if (!p) + return 0; + + if (!strcmp(p, "off")) { + pr_info("Disabling topology updates\n"); + topology_updates_enabled = false; + } + + return 0; +} +early_param("topology_updates", early_topology_updates); + #ifdef CONFIG_MEMORY_HOTPLUG /* * Find the node associated with a hot added memory section for @@ -1539,6 +1557,9 @@ int arch_update_cpu_topology(void) struct device *dev; int weight, new_nid, i = 0; + if (!prrn_enabled && !vphn_enabled) + return 0; + weight = cpumask_weight(&cpu_associativity_changes_mask); if (!weight) return 0; @@ -1592,6 +1613,15 @@ int arch_update_cpu_topology(void) cpu = cpu_last_thread_sibling(cpu); } + pr_debug("Topology update for the following CPUs:\n"); + if (cpumask_weight(&updated_cpus)) { + for (ud = &updates[0]; ud; ud = ud->next) { + pr_debug("cpu %d moving from node %d " + "to %d\n", ud->cpu, + ud->old_nid, ud->new_nid); + } + } + /* * In cases where we have nothing to update (because the updates list * is too short or because the new topology is same as the old one), @@ -1800,7 +1830,10 @@ static const struct file_operations topology_ops = { static int topology_update_init(void) { - start_topology_update(); + /* Do not poll for changes if disabled at boot */ + if (topology_updates_enabled) + start_topology_update(); + if (!proc_create("powerpc/topology_updates", 0644, NULL, &topology_ops)) return -ENOMEM; -- cgit v1.2.3 From 9aa5d32ba269bec0e7eaba2697a986a7b0bc8528 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 13 Oct 2014 03:36:16 -0400 Subject: ext4: Replace open coded mdata csum feature to helper function Besides the fact that this replacement improves code readability it also protects from errors caused direct EXT4_S(sb)->s_es manipulation which may result attempt to use uninitialized csum machinery. #Testcase_BEGIN IMG=/dev/ram0 MNT=/mnt mkfs.ext4 $IMG mount $IMG $MNT #Enable feature directly on disk, on mounted fs tune2fs -O metadata_csum $IMG # Provoke metadata update, likey result in OOPS touch $MNT/test umount $MNT #Testcase_END # Replacement script @@ expression E; @@ - EXT4_HAS_RO_COMPAT_FEATURE(E, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) + ext4_has_metadata_csum(E) https://bugzilla.kernel.org/show_bug.cgi?id=82201 Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/bitmap.c | 12 ++++-------- fs/ext4/ext4.h | 8 ++++++++ fs/ext4/extents.c | 6 ++---- fs/ext4/ialloc.c | 3 +-- fs/ext4/inline.c | 3 +-- fs/ext4/inode.c | 9 +++------ fs/ext4/ioctl.c | 3 +-- fs/ext4/mmp.c | 6 ++---- fs/ext4/namei.c | 39 +++++++++++++-------------------------- fs/ext4/resize.c | 3 +-- fs/ext4/super.c | 15 +++++---------- fs/ext4/xattr.c | 6 ++---- 12 files changed, 43 insertions(+), 70 deletions(-) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 3285aa5a706a..b610779a958c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); @@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); @@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); @@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 012e89bd9644..1483d9c6061f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2337,6 +2337,14 @@ static inline int ext4_has_group_desc_csum(struct super_block *sb) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); } +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + !EXT4_SB(sb)->s_chksum_driver); + + return (EXT4_SB(sb)->s_chksum_driver != NULL); +} static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c3ed9af20d20..37043d0b2be8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5b87fc36aab8..8012a5daf401 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1011,8 +1011,7 @@ got: spin_unlock(&sbi->s_next_gen_lock); /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 378aadf5e6db..3ea62695abce 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1128,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0dd9150c0c04..e9777f93cf05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -3928,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 3d5de16f028f..bfda18a15592 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,8 +331,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 32bce844c2e1..8313ca3324ec 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7037ecf0fc23..61756f941ed0 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -338,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, dirent); @@ -360,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, dirent); @@ -436,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -466,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -555,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -565,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1524,8 +1517,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); @@ -1691,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { @@ -1759,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -1877,8 +1867,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -2142,8 +2131,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2362,8 +2350,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bb0e80f03e2e..d5afb0a0cf25 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1210,8 +1210,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a0811cc00c91..5afe42db303c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -140,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); @@ -151,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); @@ -1989,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __le16 save_csum; __u32 csum32; @@ -3199,8 +3196,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -3508,8 +3504,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Precompute checksum seed for all metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 42823ab3718c..1e09fc77395c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + if (ext4_has_metadata_csum(inode->i_sb) && (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) return 0; return 1; @@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); -- cgit v1.2.3 From aef4885ae14f1df75b58395c5314d71f613d26d9 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 13 Oct 2014 03:42:12 -0400 Subject: ext4: move error report out of atomic context in ext4_init_block_bitmap() Error report likely result in IO so it is bad idea to do it from atomic context. This patch should fix following issue: BUG: sleeping function called from invalid context at include/linux/buffer_head.h:349 in_atomic(): 1, irqs_disabled(): 0, pid: 137, name: kworker/u128:1 5 locks held by kworker/u128:1/137: #0: ("writeback"){......}, at: [] process_one_work+0x228/0x4d0 #1: ((&(&wb->dwork)->work)){......}, at: [] process_one_work+0x228/0x4d0 #2: (jbd2_handle){......}, at: [] start_this_handle+0x712/0x7b0 #3: (&ei->i_data_sem){......}, at: [] ext4_map_blocks+0x297/0x430 #4: (&(&bgl->locks[i].lock)->rlock){......}, at: [] ext4_read_block_bitmap_nowait+0x5d0/0x630 CPU: 3 PID: 137 Comm: kworker/u128:1 Not tainted 3.17.0-rc2-00184-g82752e4 #165 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 Workqueue: writeback bdi_writeback_workfn (flush-1:0) 0000000000000411 ffff880813777288 ffffffff815c7fdc ffff880813777288 ffff880813a8bba0 ffff8808137772a8 ffffffff8108fb30 ffff880803e01e38 ffff880803e01e38 ffff8808137772c8 ffffffff811a8d53 ffff88080ecc6000 Call Trace: [] dump_stack+0x51/0x6d [] __might_sleep+0xf0/0x100 [] __sync_dirty_buffer+0x43/0xe0 [] sync_dirty_buffer+0x13/0x20 [] ext4_commit_super+0x1d1/0x230 [] save_error_info+0x23/0x30 [] __ext4_error+0xb6/0xd0 [] ? ext4_group_desc_csum+0x140/0x190 [] ext4_read_block_bitmap_nowait+0x1dc/0x630 [] ext4_mb_init_cache+0x21a/0x8f0 [] ? lru_cache_add+0x55/0x60 [] ? add_to_page_cache_lru+0x6c/0x80 [] ext4_mb_init_group+0x190/0x280 [] ext4_mb_good_group+0xc1/0x190 [] ext4_mb_regular_allocator+0x17a/0x410 [] ? ext4_mb_use_preallocated+0x31/0x380 [] ? ext4_mb_new_blocks+0x205/0x8e0 [] ? kmem_cache_alloc+0xfc/0x180 [] ext4_mb_new_blocks+0x280/0x8e0 [] ? __kmalloc+0x144/0x1c0 [] ? ext4_find_extent+0x97/0x320 [] ext4_ext_map_blocks+0xbc4/0x1050 [] ? ext4_map_blocks+0x297/0x430 [] ext4_map_blocks+0x2bb/0x430 [] ? ext4_init_io_end+0x23/0x50 [] ext4_writepages+0x564/0xaf0 [] ? _raw_spin_unlock+0x2b/0x40 [] ? lock_release_non_nested+0x2fd/0x3c0 [] ? writeback_sb_inodes+0x10e/0x490 [] ? writeback_sb_inodes+0x10e/0x490 [] do_writepages+0x23/0x40 [] __writeback_single_inode+0x9e/0x280 [] writeback_sb_inodes+0x2db/0x490 [] wb_writeback+0x174/0x2d0 [] ? lock_release_holdtime+0x29/0x190 [] wb_do_writeback+0xa3/0x200 [] bdi_writeback_workfn+0x80/0x230 [] ? process_one_work+0x228/0x4d0 [] process_one_work+0x2dd/0x4d0 [] ? process_one_work+0x228/0x4d0 [] worker_thread+0x35d/0x460 [] ? process_one_work+0x4d0/0x4d0 [] ? process_one_work+0x4d0/0x4d0 [] kthread+0xf5/0x100 [] ? local_clock+0x25/0x30 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_work Signed-off-by: Dmitry Monakhov Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- fs/ext4/balloc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index d70f154f6da3..83a6f497c4e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -static void ext4_init_block_bitmap(struct super_block *sb, +static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) @@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EIO; } memset(bh->b_data, 0, sb->s_blocksize); @@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, sb->s_blocksize * 8, bh->b_data); ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); ext4_group_desc_csum_set(sb, block_group, gdp); + return 0; } /* Return the number of free blocks in a block group. It is used when @@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh, block_group, desc); + int err; + + err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); + if (err) + ext4_error(sb, "Checksum bad for grp %u", block_group); return bh; } ext4_unlock_group(sb, block_group); -- cgit v1.2.3 From 70e956483efd8a70e86fb2260dcd2395eb1affef Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 10 Sep 2014 11:08:39 +0530 Subject: ARC: [arcfpga] consolidate machine description, DT * AA4/ML509 have same machine descriptions * Rename simulation machine description Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/angel4.dts | 5 ----- arch/arc/boot/dts/nsimosci.dts | 5 ----- arch/arc/plat-arcfpga/platform.c | 27 +++++++-------------------- 3 files changed, 7 insertions(+), 30 deletions(-) diff --git a/arch/arc/boot/dts/angel4.dts b/arch/arc/boot/dts/angel4.dts index 6b57475967a6..757e0c62c4f9 100644 --- a/arch/arc/boot/dts/angel4.dts +++ b/arch/arc/boot/dts/angel4.dts @@ -24,11 +24,6 @@ serial0 = &arcuart0; }; - memory { - device_type = "memory"; - reg = <0x00000000 0x10000000>; /* 256M */ - }; - fpga { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arc/boot/dts/nsimosci.dts b/arch/arc/boot/dts/nsimosci.dts index 4f31b2eb5cdf..825065d66d65 100644 --- a/arch/arc/boot/dts/nsimosci.dts +++ b/arch/arc/boot/dts/nsimosci.dts @@ -27,11 +27,6 @@ serial0 = &uart0; }; - memory { - device_type = "memory"; - reg = <0x80000000 0x10000000>; /* 256M */ - }; - fpga { compatible = "simple-bus"; #address-cells = <1>; diff --git a/arch/arc/plat-arcfpga/platform.c b/arch/arc/plat-arcfpga/platform.c index 6abc341e276d..57bd24f6a4bd 100644 --- a/arch/arc/plat-arcfpga/platform.c +++ b/arch/arc/plat-arcfpga/platform.c @@ -48,27 +48,14 @@ static void __init plat_fpga_populate_dev(void) * callback set, by matching the DT compatible name. */ -static const char *aa4_compat[] __initconst = { +static const char *legacy_fpga_compat[] __initconst = { "snps,arc-angel4", - NULL, -}; - -MACHINE_START(ANGEL4, "angel4") - .dt_compat = aa4_compat, - .init_early = plat_fpga_early_init, - .init_machine = plat_fpga_populate_dev, -#ifdef CONFIG_ISS_SMP_EXTN - .init_smp = iss_model_init_smp, -#endif -MACHINE_END - -static const char *ml509_compat[] __initconst = { "snps,arc-ml509", NULL, }; -MACHINE_START(ML509, "ml509") - .dt_compat = ml509_compat, +MACHINE_START(LEGACY_FPGA, "legacy_fpga") + .dt_compat = legacy_fpga_compat, .init_early = plat_fpga_early_init, .init_machine = plat_fpga_populate_dev, #ifdef CONFIG_ISS_SMP_EXTN @@ -76,13 +63,13 @@ MACHINE_START(ML509, "ml509") #endif MACHINE_END -static const char *nsimosci_compat[] __initconst = { +static const char *simulation_compat[] __initconst = { + "snps,nsim", "snps,nsimosci", NULL, }; -MACHINE_START(NSIMOSCI, "nsimosci") - .dt_compat = nsimosci_compat, - .init_early = NULL, +MACHINE_START(SIMULATION, "simulation") + .dt_compat = simulation_compat, .init_machine = plat_fpga_populate_dev, MACHINE_END -- cgit v1.2.3 From d7f8a085d4f48501b1fa253b48ec4ad7cb4d02cc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 10 Sep 2014 11:10:54 +0530 Subject: ARC: [plat*] move code out of .init_machine into common All the platforms do the same thing in init_machine callback so move it out of callback into caller of callback Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 1 + arch/arc/kernel/setup.c | 10 +++++++++- arch/arc/plat-arcfpga/platform.c | 12 ------------ arch/arc/plat-tb10x/Kconfig | 1 - arch/arc/plat-tb10x/tb10x.c | 13 ------------- 5 files changed, 10 insertions(+), 27 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 9596b0ab108d..bf1e410ee448 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -9,6 +9,7 @@ config ARC def_bool y select BUILDTIME_EXTABLE_SORT + select COMMON_CLK select CLONE_BACKWARDS # ARC Busybox based initramfs absolutely relies on DEVTMPFS for /dev select DEVTMPFS if !INITRAMFS_SOURCE="" diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index da61f2205dc5..686e3fae4420 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -13,7 +13,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -379,7 +381,13 @@ void __init setup_arch(char **cmdline_p) static int __init customize_machine(void) { - /* Add platform devices */ + of_clk_init(NULL); + /* + * Traverses flattened DeviceTree - registering platform devices + * (if any) complete with their resources + */ + of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); + if (machine_desc->init_machine) machine_desc->init_machine(); diff --git a/arch/arc/plat-arcfpga/platform.c b/arch/arc/plat-arcfpga/platform.c index 57bd24f6a4bd..b23d5fc59a48 100644 --- a/arch/arc/plat-arcfpga/platform.c +++ b/arch/arc/plat-arcfpga/platform.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -31,15 +30,6 @@ static void __init plat_fpga_early_init(void) #endif } -static void __init plat_fpga_populate_dev(void) -{ - /* - * Traverses flattened DeviceTree - registering platform devices - * (if any) complete with their resources - */ - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - /*----------------------- Machine Descriptions ------------------------------ * * Machine description is simply a set of platform/board specific callbacks @@ -57,7 +47,6 @@ static const char *legacy_fpga_compat[] __initconst = { MACHINE_START(LEGACY_FPGA, "legacy_fpga") .dt_compat = legacy_fpga_compat, .init_early = plat_fpga_early_init, - .init_machine = plat_fpga_populate_dev, #ifdef CONFIG_ISS_SMP_EXTN .init_smp = iss_model_init_smp, #endif @@ -71,5 +60,4 @@ static const char *simulation_compat[] __initconst = { MACHINE_START(SIMULATION, "simulation") .dt_compat = simulation_compat, - .init_machine = plat_fpga_populate_dev, MACHINE_END diff --git a/arch/arc/plat-tb10x/Kconfig b/arch/arc/plat-tb10x/Kconfig index 6994c188dc88..d14b3d3c5dfd 100644 --- a/arch/arc/plat-tb10x/Kconfig +++ b/arch/arc/plat-tb10x/Kconfig @@ -18,7 +18,6 @@ menuconfig ARC_PLAT_TB10X bool "Abilis TB10x" - select COMMON_CLK select PINCTRL select PINCTRL_TB10X select PINMUX diff --git a/arch/arc/plat-tb10x/tb10x.c b/arch/arc/plat-tb10x/tb10x.c index 06cb30929460..da0ac0960a4b 100644 --- a/arch/arc/plat-tb10x/tb10x.c +++ b/arch/arc/plat-tb10x/tb10x.c @@ -19,21 +19,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - #include -#include -#include -#include - #include - -static void __init tb10x_platform_init(void) -{ - of_clk_init(NULL); - of_platform_populate(NULL, of_default_bus_match_table, NULL, NULL); -} - static const char *tb10x_compat[] __initdata = { "abilis,arc-tb10x", NULL, @@ -41,5 +29,4 @@ static const char *tb10x_compat[] __initdata = { MACHINE_START(TB10x, "tb10x") .dt_compat = tb10x_compat, - .init_machine = tb10x_platform_init, MACHINE_END -- cgit v1.2.3 From 72f933e77cfc8c7e38e7214fd774806eb7ecc038 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 10 Sep 2014 11:19:28 +0530 Subject: ARC: [arcfpga] Remove more dead code specifically after switching to generic early arc uart, whole bunch of code is no longer needed Signed-off-by: Vineet Gupta --- arch/arc/plat-arcfpga/include/plat/irq.h | 27 --------------------------- arch/arc/plat-arcfpga/include/plat/memmap.h | 29 ----------------------------- arch/arc/plat-arcfpga/platform.c | 20 +------------------- arch/arc/plat-arcfpga/smp.c | 3 ++- 4 files changed, 3 insertions(+), 76 deletions(-) delete mode 100644 arch/arc/plat-arcfpga/include/plat/irq.h delete mode 100644 arch/arc/plat-arcfpga/include/plat/memmap.h diff --git a/arch/arc/plat-arcfpga/include/plat/irq.h b/arch/arc/plat-arcfpga/include/plat/irq.h deleted file mode 100644 index 2c9dea690ac4..000000000000 --- a/arch/arc/plat-arcfpga/include/plat/irq.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * vineetg: Feb 2009 - * -For AA4 board, IRQ assignments to peripherals - */ - -#ifndef __PLAT_IRQ_H -#define __PLAT_IRQ_H - -#define UART0_IRQ 5 -#define UART1_IRQ 10 -#define UART2_IRQ 11 - -#define IDE_IRQ 13 -#define PCI_IRQ 14 -#define PS2_IRQ 15 - -#ifdef CONFIG_SMP -#define IDU_INTERRUPT_0 16 -#endif - -#endif diff --git a/arch/arc/plat-arcfpga/include/plat/memmap.h b/arch/arc/plat-arcfpga/include/plat/memmap.h deleted file mode 100644 index 5c78e6135a1f..000000000000 --- a/arch/arc/plat-arcfpga/include/plat/memmap.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * vineetg: Feb 2009 - * -For AA4 board, System Memory Map for Peripherals etc - */ - -#ifndef __PLAT_MEMMAP_H -#define __PLAT_MEMMAP_H - -#define UART0_BASE 0xC0FC1000 -#define UART1_BASE 0xC0FC1100 - -#define IDE_CONTROLLER_BASE 0xC0FC9000 - -#define AHB_PCI_HOST_BRG_BASE 0xC0FD0000 - -#define PGU_BASEADDR 0xC0FC8000 -#define VLCK_ADDR 0xC0FCF028 - -#define BVCI_LAT_UNIT_BASE 0xC0FED000 - -#define PS2_BASE_ADDR 0xC0FCC000 - -#endif diff --git a/arch/arc/plat-arcfpga/platform.c b/arch/arc/plat-arcfpga/platform.c index b23d5fc59a48..afc88254acc1 100644 --- a/arch/arc/plat-arcfpga/platform.c +++ b/arch/arc/plat-arcfpga/platform.c @@ -8,27 +8,9 @@ * published by the Free Software Foundation. */ -#include #include -#include -#include -#include -#include -#include -#include #include -#include #include -#include - -static void __init plat_fpga_early_init(void) -{ - pr_info("[plat-arcfpga]: registering early dev resources\n"); - -#ifdef CONFIG_ISS_SMP_EXTN - iss_model_init_early_smp(); -#endif -} /*----------------------- Machine Descriptions ------------------------------ * @@ -46,8 +28,8 @@ static const char *legacy_fpga_compat[] __initconst = { MACHINE_START(LEGACY_FPGA, "legacy_fpga") .dt_compat = legacy_fpga_compat, - .init_early = plat_fpga_early_init, #ifdef CONFIG_ISS_SMP_EXTN + .init_early = iss_model_init_early_smp, .init_smp = iss_model_init_smp, #endif MACHINE_END diff --git a/arch/arc/plat-arcfpga/smp.c b/arch/arc/plat-arcfpga/smp.c index 92bad9122077..64797ba3bbe3 100644 --- a/arch/arc/plat-arcfpga/smp.c +++ b/arch/arc/plat-arcfpga/smp.c @@ -13,9 +13,10 @@ #include #include -#include #include +#define IDU_INTERRUPT_0 16 + static char smp_cpuinfo_buf[128]; /* -- cgit v1.2.3 From b4c43b4908fb53c6cd00281f1b19cc54d7791299 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 23 Sep 2014 11:39:04 +0200 Subject: ARC: [arcfpga] Get rid of ARC_BOARD_ANGEL4 and ARC_BOARD_ML509 Commit c00bfd974fb0 ("ARC: [arcfpga] Get rid of legacy BVCI latency unit support") removed the Kconfig symbol ARC_HAS_BVCI_LAT_UNIT. And that symbol's entry was the only place were the symbols ARC_BOARD_ANGEL4 and ARC_BOARD_ML509 were used. So ARC_BOARD_ANGEL4 and ARC_BOARD_ML509 can be removed too. Signed-off-by: Paul Bolle Signed-off-by: Vineet Gupta --- arch/arc/configs/fpga_defconfig | 1 - arch/arc/configs/fpga_noramfs_defconfig | 1 - arch/arc/configs/nsimosci_defconfig | 1 - arch/arc/plat-arcfpga/Kconfig | 11 ----------- 4 files changed, 14 deletions(-) diff --git a/arch/arc/configs/fpga_defconfig b/arch/arc/configs/fpga_defconfig index e283aa586934..ef4d3bc7b6c0 100644 --- a/arch/arc/configs/fpga_defconfig +++ b/arch/arc/configs/fpga_defconfig @@ -23,7 +23,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_HAS_RTSC is not set CONFIG_ARC_BUILTIN_DTB_NAME="angel4" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/fpga_noramfs_defconfig b/arch/arc/configs/fpga_noramfs_defconfig index 5276a52f6a2f..49c93011ab96 100644 --- a/arch/arc/configs/fpga_noramfs_defconfig +++ b/arch/arc/configs/fpga_noramfs_defconfig @@ -20,7 +20,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_HAS_RTSC is not set CONFIG_ARC_BUILTIN_DTB_NAME="angel4" CONFIG_PREEMPT=y diff --git a/arch/arc/configs/nsimosci_defconfig b/arch/arc/configs/nsimosci_defconfig index c01ba35a4eff..278dacf2a3f9 100644 --- a/arch/arc/configs/nsimosci_defconfig +++ b/arch/arc/configs/nsimosci_defconfig @@ -21,7 +21,6 @@ CONFIG_MODULES=y # CONFIG_IOSCHED_DEADLINE is not set # CONFIG_IOSCHED_CFQ is not set CONFIG_ARC_PLAT_FPGA_LEGACY=y -CONFIG_ARC_BOARD_ML509=y # CONFIG_ARC_IDE is not set # CONFIG_ARCTANGENT_EMAC is not set # CONFIG_ARC_HAS_RTSC is not set diff --git a/arch/arc/plat-arcfpga/Kconfig b/arch/arc/plat-arcfpga/Kconfig index 4965f9f4ffdc..217593a70751 100644 --- a/arch/arc/plat-arcfpga/Kconfig +++ b/arch/arc/plat-arcfpga/Kconfig @@ -18,17 +18,6 @@ menuconfig ARC_PLAT_FPGA_LEGACY if ARC_PLAT_FPGA_LEGACY -config ARC_BOARD_ANGEL4 - bool "ARC Angel4" - default y - help - ARC Angel4 FPGA Ref Platform (Xilinx Virtex Based) - -config ARC_BOARD_ML509 - bool "ML509" - help - ARC ML509 FPGA Ref Platform (Xilinx Virtex-5 Based) - config ISS_SMP_EXTN bool "ARC SMP Extensions (ISS Models only)" default n -- cgit v1.2.3 From 5c05483e2db91890faa9a7be0a831701a3f442d6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 20 Jun 2014 16:24:49 +0530 Subject: ARC: [nsimosci] Allow "headless" models to boot There are certain test configuration of virtual platform which don't have any real console device (uart/pgu). So add tty0 as a fallback console device to allow system to boot and be accessible via telnet Otherwise with ttyS0 as only console, but 8250 disabled in kernel build, init chokes. Reported-by: Anton Kolesov Signed-off-by: Vineet Gupta Cc: #3.10, 3.12, 3.14, 3.16 --- arch/arc/boot/dts/nsimosci.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/boot/dts/nsimosci.dts b/arch/arc/boot/dts/nsimosci.dts index 825065d66d65..cfaedd9c61c9 100644 --- a/arch/arc/boot/dts/nsimosci.dts +++ b/arch/arc/boot/dts/nsimosci.dts @@ -20,7 +20,7 @@ /* this is for console on PGU */ /* bootargs = "console=tty0 consoleblank=0"; */ /* this is for console on serial */ - bootargs = "earlycon=uart8250,mmio32,0xc0000000,115200n8 console=ttyS0,115200n8 consoleblank=0 debug"; + bootargs = "earlycon=uart8250,mmio32,0xc0000000,115200n8 console=tty0 console=ttyS0,115200n8 consoleblank=0 debug"; }; aliases { -- cgit v1.2.3 From 1736a56f3d1d5765fa8953d39a900a494d7e415c Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 8 Sep 2014 11:18:15 +0530 Subject: ARC: rename kconfig option for unaligned emulation Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 2 +- arch/arc/include/asm/unaligned.h | 2 +- arch/arc/kernel/Makefile | 2 +- arch/arc/kernel/disasm.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bf1e410ee448..cdb6b7596c98 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -355,7 +355,7 @@ config ARC_CURR_IN_REG kernel mode. This saves memory access for each such access -config ARC_MISALIGN_ACCESS +config ARC_EMUL_UNALIGNED bool "Emulate unaligned memory access (userspace only)" select SYSCTL_ARCH_UNALIGN_NO_WARN select SYSCTL_ARCH_UNALIGN_ALLOW diff --git a/arch/arc/include/asm/unaligned.h b/arch/arc/include/asm/unaligned.h index 3e5f071bc00c..6da6b4edaeda 100644 --- a/arch/arc/include/asm/unaligned.h +++ b/arch/arc/include/asm/unaligned.h @@ -14,7 +14,7 @@ #include #include -#ifdef CONFIG_ARC_MISALIGN_ACCESS +#ifdef CONFIG_ARC_EMUL_UNALIGNED int misaligned_fixup(unsigned long address, struct pt_regs *regs, struct callee_regs *cregs); #else diff --git a/arch/arc/kernel/Makefile b/arch/arc/kernel/Makefile index 8004b4fa6461..113f2033da9f 100644 --- a/arch/arc/kernel/Makefile +++ b/arch/arc/kernel/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_MODULES) += arcksyms.o module.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_ARC_DW2_UNWIND) += unwind.o obj-$(CONFIG_KPROBES) += kprobes.o -obj-$(CONFIG_ARC_MISALIGN_ACCESS) += unaligned.o +obj-$(CONFIG_ARC_EMUL_UNALIGNED) += unaligned.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_ARC_METAWARE_HLINK) += arc_hostlink.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o diff --git a/arch/arc/kernel/disasm.c b/arch/arc/kernel/disasm.c index b8a549c4f540..3b7cd4864ba2 100644 --- a/arch/arc/kernel/disasm.c +++ b/arch/arc/kernel/disasm.c @@ -15,7 +15,7 @@ #include #include -#if defined(CONFIG_KGDB) || defined(CONFIG_ARC_MISALIGN_ACCESS) || \ +#if defined(CONFIG_KGDB) || defined(CONFIG_ARC_EMUL_UNALIGNED) || \ defined(CONFIG_KPROBES) /* disasm_instr: Analyses instruction at addr, stores @@ -535,4 +535,4 @@ int __kprobes disasm_next_pc(unsigned long pc, struct pt_regs *regs, return instr.is_branch; } -#endif /* CONFIG_KGDB || CONFIG_ARC_MISALIGN_ACCESS || CONFIG_KPROBES */ +#endif /* CONFIG_KGDB || CONFIG_ARC_EMUL_UNALIGNED || CONFIG_KPROBES */ -- cgit v1.2.3 From c4aa49df4dca6d41d3a7488cf582a0ab778ad06d Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 19 Sep 2014 01:28:24 +0530 Subject: ARC: Update comments about uncached address space Suggested-by: Noam Camus Signed-off-by: Vineet Gupta --- arch/arc/include/asm/cache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index b3c750979aa1..7861255da32d 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -20,7 +20,7 @@ #define CACHE_LINE_MASK (~(L1_CACHE_BYTES - 1)) /* - * ARC700 doesn't cache any access in top 256M. + * ARC700 doesn't cache any access in top 1G (0xc000_0000 to 0xFFFF_FFFF) * Ideal for wiring memory mapped peripherals as we don't need to do * explicit uncached accesses (LD.di/ST.di) hence more portable drivers */ -- cgit v1.2.3 From 435abb6daf310b4c1d681097887ea7239c310732 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 10 Sep 2014 10:38:10 +0530 Subject: ARC: RIP @running_on_hw * No active users of this flag anymore * flag itself was no longer usable with new simualtor which acts just like hardware, not providing the special chip-id = 0xffff which good old ISS used to do. Signed-off-by: Vineet Gupta --- arch/arc/include/asm/setup.h | 1 - arch/arc/kernel/head.S | 10 ---------- arch/arc/kernel/setup.c | 2 -- 3 files changed, 13 deletions(-) diff --git a/arch/arc/include/asm/setup.h b/arch/arc/include/asm/setup.h index e10f8cef56a8..6e3ef5ba4f74 100644 --- a/arch/arc/include/asm/setup.h +++ b/arch/arc/include/asm/setup.h @@ -29,7 +29,6 @@ struct cpuinfo_data { }; extern int root_mountflags, end_mem; -extern int running_on_hw; void setup_processor(void); void __init setup_arch_memory(void); diff --git a/arch/arc/kernel/head.S b/arch/arc/kernel/head.S index 4d2481bd8b98..b0e8666fdccc 100644 --- a/arch/arc/kernel/head.S +++ b/arch/arc/kernel/head.S @@ -91,16 +91,6 @@ stext: st r0, [@uboot_tag] st r2, [@uboot_arg] - ; Identify if running on ISS vs Silicon - ; IDENTITY Reg [ 3 2 1 0 ] - ; (chip-id) ^^^^^ ==> 0xffff for ISS - lr r0, [identity] - lsr r3, r0, 16 - cmp r3, 0xffff - mov.z r4, 0 - mov.nz r4, 1 - st r4, [@running_on_hw] - ; setup "current" tsk and optionally cache it in dedicated r25 mov r9, @init_task SET_CURR_TASK_ON_CPU r9, r0 ; r9 = tsk, r0 = scratch diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 686e3fae4420..8fccf0328ad0 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -30,8 +30,6 @@ #define FIX_PTR(x) __asm__ __volatile__(";" : "+r"(x)) -int running_on_hw = 1; /* vs. on ISS */ - /* Part of U-boot ABI: see head.S */ int __initdata uboot_tag; char __initdata *uboot_arg; -- cgit v1.2.3 From 68d8feee85dd9ff68a2d740c90b08dc4ae2fb8e5 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 7 Aug 2014 16:24:47 -0700 Subject: ARC: remove gcc mpy heuristics Signed-off-by: Vineet Gupta --- arch/arc/Makefile | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/arc/Makefile b/arch/arc/Makefile index 8c0b1aa56f7e..10bc3d4e8a44 100644 --- a/arch/arc/Makefile +++ b/arch/arc/Makefile @@ -25,7 +25,6 @@ ifdef CONFIG_ARC_CURR_IN_REG LINUXINCLUDE += -include ${src}/arch/arc/include/asm/current.h endif -upto_gcc42 := $(call cc-ifversion, -le, 0402, y) upto_gcc44 := $(call cc-ifversion, -le, 0404, y) atleast_gcc44 := $(call cc-ifversion, -ge, 0404, y) atleast_gcc48 := $(call cc-ifversion, -ge, 0408, y) @@ -60,25 +59,11 @@ ldflags-$(CONFIG_CPU_BIG_ENDIAN) += -EB # --build-id w/o "-marclinux". Default arc-elf32-ld is OK ldflags-$(upto_gcc44) += -marclinux -ARC_LIBGCC := -mA7 -cflags-$(CONFIG_ARC_HAS_HW_MPY) += -multcost=16 - ifndef CONFIG_ARC_HAS_HW_MPY cflags-y += -mno-mpy - -# newlib for ARC700 assumes MPY to be always present, which is generally true -# However, if someone really doesn't want MPY, we need to use the 600 ver -# which coupled with -mno-mpy will use mpy emulation -# With gcc 4.4.7, -mno-mpy is enough to make any other related adjustments, -# e.g. increased cost of MPY. With gcc 4.2.1 this had to be explicitly hinted - - ifeq ($(upto_gcc42),y) - ARC_LIBGCC := -marc600 - cflags-y += -multcost=30 - endif endif -LIBGCC := $(shell $(CC) $(ARC_LIBGCC) $(cflags-y) --print-libgcc-file-name) +LIBGCC := $(shell $(CC) $(cflags-y) --print-libgcc-file-name) # Modules with short calls might break for calls into builtin-kernel KBUILD_CFLAGS_MODULE += -mlong-calls -- cgit v1.2.3 From c59414cca188e561d450b77e44ad281579946f18 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 24 Sep 2014 11:36:20 +0530 Subject: ARC: refactoring: reduce the scope of some local vars Signed-off-by: Vineet Gupta --- arch/arc/mm/cache_arc700.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/arc/mm/cache_arc700.c b/arch/arc/mm/cache_arc700.c index 9e1142729fd1..8c3a3e02ba92 100644 --- a/arch/arc/mm/cache_arc700.c +++ b/arch/arc/mm/cache_arc700.c @@ -530,16 +530,9 @@ EXPORT_SYMBOL(dma_cache_wback); */ void flush_icache_range(unsigned long kstart, unsigned long kend) { - unsigned int tot_sz, off, sz; - unsigned long phy, pfn; + unsigned int tot_sz; - /* printk("Kernel Cache Cohenercy: %lx to %lx\n",kstart, kend); */ - - /* This is not the right API for user virtual address */ - if (kstart < TASK_SIZE) { - BUG_ON("Flush icache range for user virtual addr space"); - return; - } + WARN(kstart < TASK_SIZE, "%s() can't handle user vaddr", __func__); /* Shortcut for bigger flush ranges. * Here we don't care if this was kernel virtual or phy addr @@ -572,6 +565,9 @@ void flush_icache_range(unsigned long kstart, unsigned long kend) * straddles across 2 virtual pages and hence need for loop */ while (tot_sz > 0) { + unsigned int off, sz; + unsigned long phy, pfn; + off = kstart % PAGE_SIZE; pfn = vmalloc_to_pfn((void *)kstart); phy = (pfn << PAGE_SHIFT) + off; -- cgit v1.2.3 From 3872d05299b5ab58446f484df18f71cab4628c50 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 24 Sep 2014 16:36:48 +0530 Subject: ARC: BUG() dumps stack after @msg (@msg now same as in generic BUG)) ARC specific version (doesn't panic) still makes sense so that generic code calling BUG doesn't panic and helps debugging more Signed-off-by: Vineet Gupta --- arch/arc/include/asm/bug.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/arc/include/asm/bug.h b/arch/arc/include/asm/bug.h index 5b18e94c6678..ea022d47896c 100644 --- a/arch/arc/include/asm/bug.h +++ b/arch/arc/include/asm/bug.h @@ -21,10 +21,9 @@ void show_kernel_fault_diag(const char *str, struct pt_regs *regs, unsigned long address); void die(const char *str, struct pt_regs *regs, unsigned long address); -#define BUG() do { \ - dump_stack(); \ - pr_warn("Kernel BUG in %s: %s: %d!\n", \ - __FILE__, __func__, __LINE__); \ +#define BUG() do { \ + pr_warn("BUG: failure at %s:%d/%s()!\n", __FILE__, __LINE__, __func__); \ + dump_stack(); \ } while (0) #define HAVE_ARCH_BUG -- cgit v1.2.3 From cdd4552686b5225047ce24b8449380e02c1481e1 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 24 Sep 2014 20:32:22 +0200 Subject: ARC: Remove unneeded Kconfig entry NO_DMA Architectures only need a Kconfig entry for NO_DMA if it is possible that its value will be 'y'. For arc its value will always be 'n', making it pointless. Remove it. Signed-off-by: Paul Bolle Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index cdb6b7596c98..fe44b2494609 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -74,9 +74,6 @@ config STACKTRACE_SUPPORT config HAVE_LATENCYTOP_SUPPORT def_bool y -config NO_DMA - def_bool n - source "init/Kconfig" source "kernel/Kconfig.freezer" -- cgit v1.2.3 From ebc0c74e76cec9c4dd860eb0ca1c0b39dc63c482 Mon Sep 17 00:00:00 2001 From: Anton Kolesov Date: Thu, 25 Sep 2014 13:23:24 +0400 Subject: ARC: Update order of registers in KGDB to match GDB 7.5 Order of registers has changed in GDB moving from 6.8 to 7.5. This patch updates KGDB to work properly with GDB 7.5, though makes it incompatible with 6.8. Signed-off-by: Anton Kolesov Signed-off-by: Vineet Gupta Cc: #3.10, 3.12, 3.14, 3.16 --- arch/arc/include/asm/kgdb.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/arc/include/asm/kgdb.h b/arch/arc/include/asm/kgdb.h index b65fca7ffeb5..fea931634136 100644 --- a/arch/arc/include/asm/kgdb.h +++ b/arch/arc/include/asm/kgdb.h @@ -19,7 +19,7 @@ * register API yet */ #undef DBG_MAX_REG_NUM -#define GDB_MAX_REGS 39 +#define GDB_MAX_REGS 87 #define BREAK_INSTR_SIZE 2 #define CACHE_FLUSH_IS_SAFE 1 @@ -33,23 +33,27 @@ static inline void arch_kgdb_breakpoint(void) extern void kgdb_trap(struct pt_regs *regs); -enum arc700_linux_regnums { +/* This is the numbering of registers according to the GDB. See GDB's + * arc-tdep.h for details. + * + * Registers are ordered for GDB 7.5. It is incompatible with GDB 6.8. */ +enum arc_linux_regnums { _R0 = 0, _R1, _R2, _R3, _R4, _R5, _R6, _R7, _R8, _R9, _R10, _R11, _R12, _R13, _R14, _R15, _R16, _R17, _R18, _R19, _R20, _R21, _R22, _R23, _R24, _R25, _R26, - _BTA = 27, - _LP_START = 28, - _LP_END = 29, - _LP_COUNT = 30, - _STATUS32 = 31, - _BLINK = 32, - _FP = 33, - __SP = 34, - _EFA = 35, - _RET = 36, - _ORIG_R8 = 37, - _STOP_PC = 38 + _FP = 27, + __SP = 28, + _R30 = 30, + _BLINK = 31, + _LP_COUNT = 60, + _STOP_PC = 64, + _RET = 64, + _LP_START = 65, + _LP_END = 66, + _STATUS32 = 67, + _ECR = 76, + _BTA = 82, }; #else -- cgit v1.2.3 From be64c997d96dd29029ed40d9df9cbf80fa1c7ed4 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 27 Sep 2014 12:49:11 +0530 Subject: ARC: remove extraneous __KERNEL__ guards Verified by doing make headers_install as none of these files are exported to userspace --- arch/arc/include/asm/arcregs.h | 4 ---- arch/arc/include/asm/atomic.h | 4 ---- arch/arc/include/asm/bitops.h | 4 ---- arch/arc/include/asm/current.h | 4 ---- arch/arc/include/asm/irqflags.h | 4 ---- arch/arc/include/asm/processor.h | 4 ---- arch/arc/include/asm/string.h | 3 --- arch/arc/include/asm/syscalls.h | 4 ---- arch/arc/include/asm/thread_info.h | 4 ---- 9 files changed, 35 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 372466b371bf..d6d82577cabe 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -9,8 +9,6 @@ #ifndef _ASM_ARC_ARCREGS_H #define _ASM_ARC_ARCREGS_H -#ifdef __KERNEL__ - /* Build Configuration Registers */ #define ARC_REG_DCCMBASE_BCR 0x61 /* DCCM Base Addr */ #define ARC_REG_CRC_BCR 0x62 @@ -321,6 +319,4 @@ extern struct cpuinfo_arc cpuinfo_arc700[]; #endif /* __ASEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_ARC_ARCREGS_H */ diff --git a/arch/arc/include/asm/atomic.h b/arch/arc/include/asm/atomic.h index 83f03ca6caf6..6638a0392f4e 100644 --- a/arch/arc/include/asm/atomic.h +++ b/arch/arc/include/asm/atomic.h @@ -9,8 +9,6 @@ #ifndef _ASM_ARC_ATOMIC_H #define _ASM_ARC_ATOMIC_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include @@ -228,5 +226,3 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) #endif #endif - -#endif diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h index ebc0cf3164dc..1a5bf07eefe2 100644 --- a/arch/arc/include/asm/bitops.h +++ b/arch/arc/include/asm/bitops.h @@ -13,8 +13,6 @@ #error only can be included directly #endif -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include @@ -508,6 +506,4 @@ static inline __attribute__ ((const)) int __ffs(unsigned long word) #endif /* !__ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif diff --git a/arch/arc/include/asm/current.h b/arch/arc/include/asm/current.h index 87b918585c4a..c2453ee62801 100644 --- a/arch/arc/include/asm/current.h +++ b/arch/arc/include/asm/current.h @@ -12,8 +12,6 @@ #ifndef _ASM_ARC_CURRENT_H #define _ASM_ARC_CURRENT_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #ifdef CONFIG_ARC_CURR_IN_REG @@ -27,6 +25,4 @@ register struct task_struct *curr_arc asm("r25"); #endif /* ! __ASSEMBLY__ */ -#endif /* __KERNEL__ */ - #endif /* _ASM_ARC_CURRENT_H */ diff --git a/arch/arc/include/asm/irqflags.h b/arch/arc/include/asm/irqflags.h index 587df8236e8b..742816f1b210 100644 --- a/arch/arc/include/asm/irqflags.h +++ b/arch/arc/include/asm/irqflags.h @@ -15,8 +15,6 @@ * -Conditionally disable interrupts (if they are not enabled, don't disable) */ -#ifdef __KERNEL__ - #include /* status32 Reg bits related to Interrupt Handling */ @@ -169,6 +167,4 @@ static inline int arch_irqs_disabled(void) #endif /* __ASSEMBLY__ */ -#endif /* KERNEL */ - #endif diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 82588f3ba77f..08fbe2e9d4cc 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -14,8 +14,6 @@ #ifndef __ASM_ARC_PROCESSOR_H #define __ASM_ARC_PROCESSOR_H -#ifdef __KERNEL__ - #ifndef __ASSEMBLY__ #include @@ -128,6 +126,4 @@ extern unsigned int get_wchan(struct task_struct *p); */ #define TASK_UNMAPPED_BASE (TASK_SIZE / 3) -#endif /* __KERNEL__ */ - #endif /* __ASM_ARC_PROCESSOR_H */ diff --git a/arch/arc/include/asm/string.h b/arch/arc/include/asm/string.h index 87676c8f1412..95822b550a18 100644 --- a/arch/arc/include/asm/string.h +++ b/arch/arc/include/asm/string.h @@ -17,8 +17,6 @@ #include -#ifdef __KERNEL__ - #define __HAVE_ARCH_MEMSET #define __HAVE_ARCH_MEMCPY #define __HAVE_ARCH_MEMCMP @@ -36,5 +34,4 @@ extern char *strcpy(char *dest, const char *src); extern int strcmp(const char *cs, const char *ct); extern __kernel_size_t strlen(const char *); -#endif /* __KERNEL__ */ #endif /* _ASM_ARC_STRING_H */ diff --git a/arch/arc/include/asm/syscalls.h b/arch/arc/include/asm/syscalls.h index dd785befe7fd..e56f9fcc5581 100644 --- a/arch/arc/include/asm/syscalls.h +++ b/arch/arc/include/asm/syscalls.h @@ -9,8 +9,6 @@ #ifndef _ASM_ARC_SYSCALLS_H #define _ASM_ARC_SYSCALLS_H 1 -#ifdef __KERNEL__ - #include #include #include @@ -22,6 +20,4 @@ int sys_arc_gettls(void); #include -#endif /* __KERNEL__ */ - #endif diff --git a/arch/arc/include/asm/thread_info.h b/arch/arc/include/asm/thread_info.h index 45be21672011..02bc5ec0fb2e 100644 --- a/arch/arc/include/asm/thread_info.h +++ b/arch/arc/include/asm/thread_info.h @@ -16,8 +16,6 @@ #ifndef _ASM_THREAD_INFO_H #define _ASM_THREAD_INFO_H -#ifdef __KERNEL__ - #include #ifdef CONFIG_16KSTACKS @@ -114,6 +112,4 @@ static inline __attribute_const__ struct thread_info *current_thread_info(void) * syscall, so all that reamins to be tested is _TIF_WORK_MASK */ -#endif /* __KERNEL__ */ - #endif /* _ASM_THREAD_INFO_H */ -- cgit v1.2.3 From 52e9bae93802bd29c33be11e9e758ad7daac805f Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 27 Sep 2014 12:53:41 +0530 Subject: ARC: unbork FPU save/restore Fixes: 2ab402dfd65d15a4b2 "ARC: make start_thread() out-of-line" CC: #3.16 Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 8 -------- arch/arc/include/asm/processor.h | 9 +++++++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index d6d82577cabe..3ea5b437bfb9 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -189,14 +189,6 @@ #define PAGES_TO_KB(n_pages) ((n_pages) << (PAGE_SHIFT - 10)) #define PAGES_TO_MB(n_pages) (PAGES_TO_KB(n_pages) >> 10) -#ifdef CONFIG_ARC_FPU_SAVE_RESTORE -/* These DPFP regs need to be saved/restored across ctx-sw */ -struct arc_fpu { - struct { - unsigned int l, h; - } aux_dpfp[2]; -}; -#endif /* *************************************************************** diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 08fbe2e9d4cc..210fe97464c3 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -18,6 +18,15 @@ #include +#ifdef CONFIG_ARC_FPU_SAVE_RESTORE +/* These DPFP regs need to be saved/restored across ctx-sw */ +struct arc_fpu { + struct { + unsigned int l, h; + } aux_dpfp[2]; +}; +#endif + /* Arch specific stuff which needs to be saved per task. * However these items are not so important so as to earn a place in * struct thread_info -- cgit v1.2.3 From c4b9856b5e1eb6d4f0d226f3c48be208475fc3d7 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 25 Sep 2014 16:07:44 +0530 Subject: ARC: boot: consolidate cross-checking of h/w and s/w Signed-off-by: Vineet Gupta --- arch/arc/kernel/setup.c | 44 +++++++++++++++----------------------------- 1 file changed, 15 insertions(+), 29 deletions(-) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 8fccf0328ad0..5a10b63e2283 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -236,10 +236,10 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) return buf; } -static void arc_chk_ccms(void) +static void arc_chk_core_config(void) { -#if defined(CONFIG_ARC_HAS_DCCM) || defined(CONFIG_ARC_HAS_ICCM) struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; + int fpu_enabled; #ifdef CONFIG_ARC_HAS_DCCM /* @@ -257,33 +257,20 @@ static void arc_chk_ccms(void) if (CONFIG_ARC_ICCM_SZ != cpu->iccm.sz) panic("Linux built with incorrect ICCM Size\n"); #endif -#endif -} -/* - * Ensure that FP hardware and kernel config match - * -If hardware contains DPFP, kernel needs to save/restore FPU state - * across context switches - * -If hardware lacks DPFP, but kernel configured to save FPU state then - * kernel trying to access non-existant DPFP regs will crash - * - * We only check for Dbl precision Floating Point, because only DPFP - * hardware has dedicated regs which need to be saved/restored on ctx-sw - * (Single Precision uses core regs), thus kernel is kind of oblivious to it - */ -static void arc_chk_fpu(void) -{ - struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; + /* + * FP hardware/software config sanity + * -If hardware contains DPFP, kernel needs to save/restore FPU state + * -If not, it will crash trying to save/restore the non-existant regs + * + * (only DPDP checked since SP has no arch visible regs) + */ + fpu_enabled = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE); - if (cpu->dpfp.ver) { -#ifndef CONFIG_ARC_FPU_SAVE_RESTORE - pr_warn("DPFP support broken in this kernel...\n"); -#endif - } else { -#ifdef CONFIG_ARC_FPU_SAVE_RESTORE - panic("H/w lacks DPFP support, apps won't work\n"); -#endif - } + if (cpu->dpfp.ver && !fpu_enabled) + pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n"); + else if (!cpu->dpfp.ver && fpu_enabled) + panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n"); } /* @@ -304,12 +291,11 @@ void setup_processor(void) arc_mmu_init(); arc_cache_init(); - arc_chk_ccms(); printk(arc_extn_mumbojumbo(cpu_id, str, sizeof(str))); printk(arc_platform_smp_cpuinfo()); - arc_chk_fpu(); + arc_chk_core_config(); } static inline int is_kernel(unsigned long addr) -- cgit v1.2.3 From 56372082533afb859e6d64707859349a2ee171bf Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 25 Sep 2014 16:54:43 +0530 Subject: ARC: boot: cpu feature print enhancements Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 77 ++++++++++----- arch/arc/kernel/perf_event.c | 22 ++--- arch/arc/kernel/setup.c | 207 ++++++++++++++++++++++------------------- arch/arc/mm/tlb.c | 8 +- 4 files changed, 176 insertions(+), 138 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index 3ea5b437bfb9..be33db8a2ee3 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -12,14 +12,13 @@ /* Build Configuration Registers */ #define ARC_REG_DCCMBASE_BCR 0x61 /* DCCM Base Addr */ #define ARC_REG_CRC_BCR 0x62 -#define ARC_REG_DVFB_BCR 0x64 -#define ARC_REG_EXTARITH_BCR 0x65 #define ARC_REG_VECBASE_BCR 0x68 #define ARC_REG_PERIBASE_BCR 0x69 -#define ARC_REG_FP_BCR 0x6B /* Single-Precision FPU */ -#define ARC_REG_DPFP_BCR 0x6C /* Dbl Precision FPU */ +#define ARC_REG_FP_BCR 0x6B /* ARCompact: Single-Precision FPU */ +#define ARC_REG_DPFP_BCR 0x6C /* ARCompact: Dbl Precision FPU */ #define ARC_REG_DCCM_BCR 0x74 /* DCCM Present + SZ */ #define ARC_REG_TIMERS_BCR 0x75 +#define ARC_REG_AP_BCR 0x76 #define ARC_REG_ICCM_BCR 0x78 #define ARC_REG_XY_MEM_BCR 0x79 #define ARC_REG_MAC_BCR 0x7a @@ -29,6 +28,9 @@ #define ARC_REG_MIXMAX_BCR 0x7e #define ARC_REG_BARREL_BCR 0x7f #define ARC_REG_D_UNCACH_BCR 0x6A +#define ARC_REG_BPU_BCR 0xc0 +#define ARC_REG_ISA_CFG_BCR 0xc1 +#define ARC_REG_SMART_BCR 0xFF /* status32 Bits Positions */ #define STATUS_AE_BIT 5 /* Exception active */ @@ -202,27 +204,19 @@ struct bcr_identity { #endif }; -#define EXTN_SWAP_VALID 0x1 -#define EXTN_NORM_VALID 0x2 -#define EXTN_MINMAX_VALID 0x2 -#define EXTN_BARREL_VALID 0x2 - -struct bcr_extn { +struct bcr_isa { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:20, crc:1, ext_arith:2, mul:2, barrel:2, minmax:2, - norm:2, swap:1; + unsigned int pad1:23, atomic1:1, ver:8; #else - unsigned int swap:1, norm:2, minmax:2, barrel:2, mul:2, ext_arith:2, - crc:1, pad:20; + unsigned int ver:8, atomic1:1, pad1:23; #endif }; -/* DSP Options Ref Manual */ -struct bcr_extn_mac_mul { +struct bcr_mpy { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:16, type:8, ver:8; + unsigned int pad:8, x1616:8, dsp:4, cycles:2, type:2, ver:8; #else - unsigned int ver:8, type:8, pad:16; + unsigned int ver:8, type:2, cycles:2, dsp:4, x1616:8, pad:8; #endif }; @@ -241,6 +235,7 @@ struct bcr_perip { unsigned int pad:8, sz:8, pad2:8, start:8; #endif }; + struct bcr_iccm { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int base:16, pad:5, sz:3, ver:8; @@ -267,8 +262,8 @@ struct bcr_dccm { #endif }; -/* Both SP and DP FPU BCRs have same format */ -struct bcr_fp { +/* ARCompact: Both SP and DP FPU BCRs have same format */ +struct bcr_fp_arcompact { #ifdef CONFIG_CPU_BIG_ENDIAN unsigned int fast:1, ver:8; #else @@ -276,6 +271,30 @@ struct bcr_fp { #endif }; +struct bcr_timer { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad2:15, rtsc:1, pad1:6, t1:1, t0:1, ver:8; +#else + unsigned int ver:8, t0:1, t1:1, pad1:6, rtsc:1, pad2:15; +#endif +}; + +struct bcr_bpu_arcompact { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad2:19, fam:1, pad:2, ent:2, ver:8; +#else + unsigned int ver:8, ent:2, pad:2, fam:1, pad2:19; +#endif +}; + +struct bcr_generic { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:24, ver:8; +#else + unsigned int ver:8, pad:24; +#endif +}; + /* ******************************************************************* * Generic structures to hold build configuration used at runtime @@ -289,6 +308,10 @@ struct cpuinfo_arc_cache { unsigned int sz_k:8, line_len:8, assoc:4, ver:4, alias:1, vipt:1, pad:6; }; +struct cpuinfo_arc_bpu { + unsigned int ver, full, num_cache, num_pred; +}; + struct cpuinfo_arc_ccm { unsigned int base_addr, sz; }; @@ -296,15 +319,21 @@ struct cpuinfo_arc_ccm { struct cpuinfo_arc { struct cpuinfo_arc_cache icache, dcache; struct cpuinfo_arc_mmu mmu; + struct cpuinfo_arc_bpu bpu; struct bcr_identity core; - unsigned int timers; + struct bcr_isa isa; + struct bcr_timer timers; unsigned int vec_base; unsigned int uncached_base; struct cpuinfo_arc_ccm iccm, dccm; - struct bcr_extn extn; + struct { + unsigned int swap:1, norm:1, minmax:1, barrel:1, crc:1, pad1:3, + fpu_sp:1, fpu_dp:1, pad2:6, + debug:1, ap:1, smart:1, rtt:1, pad3:4, + pad4:8; + } extn; + struct bcr_mpy extn_mpy; struct bcr_extn_xymem extn_xymem; - struct bcr_extn_mac_mul extn_mac_mul; - struct bcr_fp fp, dpfp; }; extern struct cpuinfo_arc cpuinfo_arc700[]; diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index b9a5685a990e..ae1c485cbc68 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -244,25 +244,23 @@ static int arc_pmu_device_probe(struct platform_device *pdev) pr_err("This core does not have performance counters!\n"); return -ENODEV; } + BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); - arc_pmu = devm_kzalloc(&pdev->dev, sizeof(struct arc_pmu), - GFP_KERNEL); + READ_BCR(ARC_REG_CC_BUILD, cc_bcr); + if (!cc_bcr.v) { + pr_err("Performance counters exist, but no countable conditions?\n"); + return -ENODEV; + } + + arc_pmu = devm_kzalloc(&pdev->dev, sizeof(struct arc_pmu), GFP_KERNEL); if (!arc_pmu) return -ENOMEM; arc_pmu->n_counters = pct_bcr.c; - BUG_ON(arc_pmu->n_counters > ARC_PMU_MAX_HWEVENTS); - arc_pmu->counter_size = 32 + (pct_bcr.s << 4); - pr_info("ARC PMU found with %d counters of size %d bits\n", - arc_pmu->n_counters, arc_pmu->counter_size); - - READ_BCR(ARC_REG_CC_BUILD, cc_bcr); - - if (!cc_bcr.v) - pr_err("Strange! Performance counters exist, but no countable conditions?\n"); - pr_info("ARC PMU has %d countable conditions\n", cc_bcr.c); + pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", + arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_HW_MAX; i++) diff --git a/arch/arc/kernel/setup.c b/arch/arc/kernel/setup.c index 5a10b63e2283..252bf603db9c 100644 --- a/arch/arc/kernel/setup.c +++ b/arch/arc/kernel/setup.c @@ -43,26 +43,26 @@ struct cpuinfo_arc cpuinfo_arc700[NR_CPUS]; static void read_arc_build_cfg_regs(void) { struct bcr_perip uncached_space; + struct bcr_generic bcr; struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; FIX_PTR(cpu); READ_BCR(AUX_IDENTITY, cpu->core); + READ_BCR(ARC_REG_ISA_CFG_BCR, cpu->isa); - cpu->timers = read_aux_reg(ARC_REG_TIMERS_BCR); + READ_BCR(ARC_REG_TIMERS_BCR, cpu->timers); cpu->vec_base = read_aux_reg(AUX_INTR_VEC_BASE); READ_BCR(ARC_REG_D_UNCACH_BCR, uncached_space); cpu->uncached_base = uncached_space.start << 24; - cpu->extn.mul = read_aux_reg(ARC_REG_MUL_BCR); - cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR); - cpu->extn.norm = read_aux_reg(ARC_REG_NORM_BCR); - cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR); - cpu->extn.barrel = read_aux_reg(ARC_REG_BARREL_BCR); - READ_BCR(ARC_REG_MAC_BCR, cpu->extn_mac_mul); + READ_BCR(ARC_REG_MUL_BCR, cpu->extn_mpy); - cpu->extn.ext_arith = read_aux_reg(ARC_REG_EXTARITH_BCR); - cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR); + cpu->extn.norm = read_aux_reg(ARC_REG_NORM_BCR) > 1 ? 1 : 0; /* 2,3 */ + cpu->extn.barrel = read_aux_reg(ARC_REG_BARREL_BCR) > 1 ? 1 : 0; /* 2,3 */ + cpu->extn.swap = read_aux_reg(ARC_REG_SWAP_BCR) ? 1 : 0; /* 1,3 */ + cpu->extn.crc = read_aux_reg(ARC_REG_CRC_BCR) ? 1 : 0; + cpu->extn.minmax = read_aux_reg(ARC_REG_MIXMAX_BCR) > 1 ? 1 : 0; /* 2 */ /* Note that we read the CCM BCRs independent of kernel config * This is to catch the cases where user doesn't know that @@ -96,43 +96,76 @@ static void read_arc_build_cfg_regs(void) read_decode_mmu_bcr(); read_decode_cache_bcr(); - READ_BCR(ARC_REG_FP_BCR, cpu->fp); - READ_BCR(ARC_REG_DPFP_BCR, cpu->dpfp); + { + struct bcr_fp_arcompact sp, dp; + struct bcr_bpu_arcompact bpu; + + READ_BCR(ARC_REG_FP_BCR, sp); + READ_BCR(ARC_REG_DPFP_BCR, dp); + cpu->extn.fpu_sp = sp.ver ? 1 : 0; + cpu->extn.fpu_dp = dp.ver ? 1 : 0; + + READ_BCR(ARC_REG_BPU_BCR, bpu); + cpu->bpu.ver = bpu.ver; + cpu->bpu.full = bpu.fam ? 1 : 0; + if (bpu.ent) { + cpu->bpu.num_cache = 256 << (bpu.ent - 1); + cpu->bpu.num_pred = 256 << (bpu.ent - 1); + } + } + + READ_BCR(ARC_REG_AP_BCR, bcr); + cpu->extn.ap = bcr.ver ? 1 : 0; + + READ_BCR(ARC_REG_SMART_BCR, bcr); + cpu->extn.smart = bcr.ver ? 1 : 0; + + cpu->extn.debug = cpu->extn.ap | cpu->extn.smart; } static const struct cpuinfo_data arc_cpu_tbl[] = { - { {0x10, "ARCTangent A5"}, 0x1F}, { {0x20, "ARC 600" }, 0x2F}, { {0x30, "ARC 700" }, 0x33}, { {0x34, "ARC 700 R4.10"}, 0x34}, + { {0x35, "ARC 700 R4.11"}, 0x35}, { {0x00, NULL } } }; +#define IS_AVAIL1(v, str) ((v) ? str : "") +#define IS_USED(cfg) (IS_ENABLED(cfg) ? "" : "(not used) ") +#define IS_AVAIL2(v, str, cfg) IS_AVAIL1(v, str), IS_AVAIL1(v, IS_USED(cfg)) + static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) { - int n = 0; struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; struct bcr_identity *core = &cpu->core; const struct cpuinfo_data *tbl; - int be = 0; -#ifdef CONFIG_CPU_BIG_ENDIAN - be = 1; -#endif + char *isa_nm; + int i, be, atomic; + int n = 0; + FIX_PTR(cpu); + { + isa_nm = "ARCompact"; + be = IS_ENABLED(CONFIG_CPU_BIG_ENDIAN); + + atomic = cpu->isa.atomic1; + if (!cpu->isa.ver) /* ISA BCR absent, use Kconfig info */ + atomic = IS_ENABLED(CONFIG_ARC_HAS_LLSC); + } + n += scnprintf(buf + n, len - n, - "\nARC IDENTITY\t: Family [%#02x]" - " Cpu-id [%#02x] Chip-id [%#4x]\n", - core->family, core->cpu_id, - core->chip_id); + "\nIDENTITY\t: ARCVER [%#02x] ARCNUM [%#02x] CHIPID [%#4x]\n", + core->family, core->cpu_id, core->chip_id); for (tbl = &arc_cpu_tbl[0]; tbl->info.id != 0; tbl++) { if ((core->family >= tbl->info.id) && (core->family <= tbl->up_range)) { n += scnprintf(buf + n, len - n, - "processor\t: %s %s\n", - tbl->info.str, - be ? "[Big Endian]" : ""); + "processor [%d]\t: %s (%s ISA) %s\n", + cpu_id, tbl->info.str, isa_nm, + IS_AVAIL1(be, "[Big-Endian]")); break; } } @@ -144,34 +177,35 @@ static char *arc_cpu_mumbojumbo(int cpu_id, char *buf, int len) (unsigned int)(arc_get_core_freq() / 1000000), (unsigned int)(arc_get_core_freq() / 10000) % 100); - n += scnprintf(buf + n, len - n, "Timers\t\t: %s %s\n", - (cpu->timers & 0x200) ? "TIMER1" : "", - (cpu->timers & 0x100) ? "TIMER0" : ""); + n += scnprintf(buf + n, len - n, "Timers\t\t: %s%s%s%s\nISA Extn\t: ", + IS_AVAIL1(cpu->timers.t0, "Timer0 "), + IS_AVAIL1(cpu->timers.t1, "Timer1 "), + IS_AVAIL2(cpu->timers.rtsc, "64-bit RTSC ", CONFIG_ARC_HAS_RTSC)); - n += scnprintf(buf + n, len - n, "Vect Tbl Base\t: %#x\n", - cpu->vec_base); + n += i = scnprintf(buf + n, len - n, "%s%s", + IS_AVAIL2(atomic, "atomic ", CONFIG_ARC_HAS_LLSC)); - n += scnprintf(buf + n, len - n, "UNCACHED Base\t: %#x\n", - cpu->uncached_base); + if (i) + n += scnprintf(buf + n, len - n, "\n\t\t: "); - return buf; -} + n += scnprintf(buf + n, len - n, "%s%s%s%s%s%s%s%s\n", + IS_AVAIL1(cpu->extn_mpy.ver, "mpy "), + IS_AVAIL1(cpu->extn.norm, "norm "), + IS_AVAIL1(cpu->extn.barrel, "barrel-shift "), + IS_AVAIL1(cpu->extn.swap, "swap "), + IS_AVAIL1(cpu->extn.minmax, "minmax "), + IS_AVAIL1(cpu->extn.crc, "crc "), + IS_AVAIL2(1, "swape", CONFIG_ARC_HAS_SWAPE)); -static const struct id_to_str mul_type_nm[] = { - { 0x0, "N/A"}, - { 0x1, "32x32 (spl Result Reg)" }, - { 0x2, "32x32 (ANY Result Reg)" } -}; + if (cpu->bpu.ver) + n += scnprintf(buf + n, len - n, + "BPU\t\t: %s%s match, cache:%d, Predict Table:%d\n", + IS_AVAIL1(cpu->bpu.full, "full"), + IS_AVAIL1(!cpu->bpu.full, "partial"), + cpu->bpu.num_cache, cpu->bpu.num_pred); -static const struct id_to_str mac_mul_nm[] = { - {0x0, "N/A"}, - {0x1, "N/A"}, - {0x2, "Dual 16 x 16"}, - {0x3, "N/A"}, - {0x4, "32x16"}, - {0x5, "N/A"}, - {0x6, "Dual 16x16 and 32x16"} -}; + return buf; +} static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) { @@ -179,57 +213,27 @@ static char *arc_extn_mumbojumbo(int cpu_id, char *buf, int len) struct cpuinfo_arc *cpu = &cpuinfo_arc700[cpu_id]; FIX_PTR(cpu); -#define IS_AVAIL1(var, str) ((var) ? str : "") -#define IS_AVAIL2(var, str) ((var == 0x2) ? str : "") -#define IS_USED(cfg) (IS_ENABLED(cfg) ? "(in-use)" : "(not used)") n += scnprintf(buf + n, len - n, - "Extn [700-Base]\t: %s %s %s %s %s %s\n", - IS_AVAIL2(cpu->extn.norm, "norm,"), - IS_AVAIL2(cpu->extn.barrel, "barrel-shift,"), - IS_AVAIL1(cpu->extn.swap, "swap,"), - IS_AVAIL2(cpu->extn.minmax, "minmax,"), - IS_AVAIL1(cpu->extn.crc, "crc,"), - IS_AVAIL2(cpu->extn.ext_arith, "ext-arith")); - - n += scnprintf(buf + n, len - n, "Extn [700-MPY]\t: %s", - mul_type_nm[cpu->extn.mul].str); - - n += scnprintf(buf + n, len - n, " MAC MPY: %s\n", - mac_mul_nm[cpu->extn_mac_mul.type].str); - - if (cpu->core.family == 0x34) { - n += scnprintf(buf + n, len - n, - "Extn [700-4.10]\t: LLOCK/SCOND %s, SWAPE %s, RTSC %s\n", - IS_USED(CONFIG_ARC_HAS_LLSC), - IS_USED(CONFIG_ARC_HAS_SWAPE), - IS_USED(CONFIG_ARC_HAS_RTSC)); - } - - n += scnprintf(buf + n, len - n, "Extn [CCM]\t: %s", - !(cpu->dccm.sz || cpu->iccm.sz) ? "N/A" : ""); - - if (cpu->dccm.sz) - n += scnprintf(buf + n, len - n, "DCCM: @ %x, %d KB ", - cpu->dccm.base_addr, TO_KB(cpu->dccm.sz)); - - if (cpu->iccm.sz) - n += scnprintf(buf + n, len - n, "ICCM: @ %x, %d KB", + "Vector Table\t: %#x\nUncached Base\t: %#x\n", + cpu->vec_base, cpu->uncached_base); + + if (cpu->extn.fpu_sp || cpu->extn.fpu_dp) + n += scnprintf(buf + n, len - n, "FPU\t\t: %s%s\n", + IS_AVAIL1(cpu->extn.fpu_sp, "SP "), + IS_AVAIL1(cpu->extn.fpu_dp, "DP ")); + + if (cpu->extn.debug) + n += scnprintf(buf + n, len - n, "DEBUG\t\t: %s%s%s\n", + IS_AVAIL1(cpu->extn.ap, "ActionPoint "), + IS_AVAIL1(cpu->extn.smart, "smaRT "), + IS_AVAIL1(cpu->extn.rtt, "RTT ")); + + if (cpu->dccm.sz || cpu->iccm.sz) + n += scnprintf(buf + n, len - n, "Extn [CCM]\t: DCCM @ %x, %d KB / ICCM: @ %x, %d KB\n", + cpu->dccm.base_addr, TO_KB(cpu->dccm.sz), cpu->iccm.base_addr, TO_KB(cpu->iccm.sz)); - n += scnprintf(buf + n, len - n, "\nExtn [FPU]\t: %s", - !(cpu->fp.ver || cpu->dpfp.ver) ? "N/A" : ""); - - if (cpu->fp.ver) - n += scnprintf(buf + n, len - n, "SP [v%d] %s", - cpu->fp.ver, cpu->fp.fast ? "(fast)" : ""); - - if (cpu->dpfp.ver) - n += scnprintf(buf + n, len - n, "DP [v%d] %s", - cpu->dpfp.ver, cpu->dpfp.fast ? "(fast)" : ""); - - n += scnprintf(buf + n, len - n, "\n"); - n += scnprintf(buf + n, len - n, "OS ABI [v3]\t: no-legacy-syscalls\n"); @@ -241,6 +245,15 @@ static void arc_chk_core_config(void) struct cpuinfo_arc *cpu = &cpuinfo_arc700[smp_processor_id()]; int fpu_enabled; + if (!cpu->timers.t0) + panic("Timer0 is not present!\n"); + + if (!cpu->timers.t1) + panic("Timer1 is not present!\n"); + + if (IS_ENABLED(CONFIG_ARC_HAS_RTSC) && !cpu->timers.rtsc) + panic("RTSC is not present\n"); + #ifdef CONFIG_ARC_HAS_DCCM /* * DCCM can be arbit placed in hardware. @@ -267,9 +280,9 @@ static void arc_chk_core_config(void) */ fpu_enabled = IS_ENABLED(CONFIG_ARC_FPU_SAVE_RESTORE); - if (cpu->dpfp.ver && !fpu_enabled) + if (cpu->extn.fpu_dp && !fpu_enabled) pr_warn("CONFIG_ARC_FPU_SAVE_RESTORE needed for working apps\n"); - else if (!cpu->dpfp.ver && fpu_enabled) + else if (!cpu->extn.fpu_dp && fpu_enabled) panic("FPU non-existent, disable CONFIG_ARC_FPU_SAVE_RESTORE\n"); } @@ -405,7 +418,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, arc_cpu_mumbojumbo(cpu_id, str, PAGE_SIZE)); - seq_printf(m, "Bogo MIPS : \t%lu.%02lu\n", + seq_printf(m, "Bogo MIPS\t: %lu.%02lu\n", loops_per_jiffy / (500000 / HZ), (loops_per_jiffy / (5000 / HZ)) % 100); diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index e1acf0ce5647..7f47d2a56f44 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -609,14 +609,12 @@ char *arc_mmu_mumbojumbo(int cpu_id, char *buf, int len) int n = 0; struct cpuinfo_arc_mmu *p_mmu = &cpuinfo_arc700[cpu_id].mmu; - n += scnprintf(buf + n, len - n, "ARC700 MMU [v%x]\t: %dk PAGE, ", - p_mmu->ver, TO_KB(p_mmu->pg_sz)); - n += scnprintf(buf + n, len - n, - "J-TLB %d (%dx%d), uDTLB %d, uITLB %d, %s\n", + "MMU [v%x]\t: %dk PAGE, JTLB %d (%dx%d), uDTLB %d, uITLB %d %s\n", + p_mmu->ver, TO_KB(p_mmu->pg_sz), p_mmu->num_tlb, p_mmu->sets, p_mmu->ways, p_mmu->u_dtlb, p_mmu->u_itlb, - IS_ENABLED(CONFIG_ARC_MMU_SASID) ? "SASID" : ""); + IS_ENABLED(CONFIG_ARC_MMU_SASID) ? ",SASID" : ""); return buf; } -- cgit v1.2.3 From 60779143b5451095b4bbfb021d39955cb4794913 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 13 Oct 2014 11:34:13 -0400 Subject: Revert "drm/radeon: drop btc_get_max_clock_from_voltage_dependency_table" This reverts commit fc9dfeb1383287631ad5c5a676a2558b799db6e9. There are still some stability problems on some SI boards so bring this back. --- drivers/gpu/drm/radeon/btc_dpm.c | 17 +++++++++++++++++ drivers/gpu/drm/radeon/btc_dpm.h | 2 ++ 2 files changed, 19 insertions(+) diff --git a/drivers/gpu/drm/radeon/btc_dpm.c b/drivers/gpu/drm/radeon/btc_dpm.c index 300d971187c4..b7181607e421 100644 --- a/drivers/gpu/drm/radeon/btc_dpm.c +++ b/drivers/gpu/drm/radeon/btc_dpm.c @@ -1170,6 +1170,23 @@ static const struct radeon_blacklist_clocks btc_blacklist_clocks[] = { 25000, 30000, RADEON_SCLK_UP } }; +void btc_get_max_clock_from_voltage_dependency_table(struct radeon_clock_voltage_dependency_table *table, + u32 *max_clock) +{ + u32 i, clock = 0; + + if ((table == NULL) || (table->count == 0)) { + *max_clock = clock; + return; + } + + for (i = 0; i < table->count; i++) { + if (clock < table->entries[i].clk) + clock = table->entries[i].clk; + } + *max_clock = clock; +} + void btc_apply_voltage_dependency_rules(struct radeon_clock_voltage_dependency_table *table, u32 clock, u16 max_voltage, u16 *voltage) { diff --git a/drivers/gpu/drm/radeon/btc_dpm.h b/drivers/gpu/drm/radeon/btc_dpm.h index 1a15e0e41950..3b6f12b7760b 100644 --- a/drivers/gpu/drm/radeon/btc_dpm.h +++ b/drivers/gpu/drm/radeon/btc_dpm.h @@ -46,6 +46,8 @@ void btc_adjust_clock_combinations(struct radeon_device *rdev, struct rv7xx_pl *pl); void btc_apply_voltage_dependency_rules(struct radeon_clock_voltage_dependency_table *table, u32 clock, u16 max_voltage, u16 *voltage); +void btc_get_max_clock_from_voltage_dependency_table(struct radeon_clock_voltage_dependency_table *table, + u32 *max_clock); void btc_apply_voltage_delta_rules(struct radeon_device *rdev, u16 max_vddc, u16 max_vddci, u16 *vddc, u16 *vddci); -- cgit v1.2.3 From 1db7802418596880b51d78408f10f25e6fbd8656 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 13 Oct 2014 11:35:06 -0400 Subject: Revert "drm/radeon/dpm: drop clk/voltage dependency filters for SI" This reverts commit 186b1b2ba2a0684e3d2d3703427a993a3b35b16d. There are still some stability problems on some SI boards so bring this back. --- drivers/gpu/drm/radeon/si_dpm.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 9e4d5d7d348f..70e61ffeace2 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -2916,6 +2916,7 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, bool disable_sclk_switching = false; u32 mclk, sclk; u16 vddc, vddci; + u32 max_sclk_vddc, max_mclk_vddci, max_mclk_vddc; int i; if ((rdev->pm.dpm.new_active_crtc_count > 1) || @@ -2949,6 +2950,29 @@ static void si_apply_state_adjust_rules(struct radeon_device *rdev, } } + /* limit clocks to max supported clocks based on voltage dependency tables */ + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddc_dependency_on_sclk, + &max_sclk_vddc); + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddci_dependency_on_mclk, + &max_mclk_vddci); + btc_get_max_clock_from_voltage_dependency_table(&rdev->pm.dpm.dyn_state.vddc_dependency_on_mclk, + &max_mclk_vddc); + + for (i = 0; i < ps->performance_level_count; i++) { + if (max_sclk_vddc) { + if (ps->performance_levels[i].sclk > max_sclk_vddc) + ps->performance_levels[i].sclk = max_sclk_vddc; + } + if (max_mclk_vddci) { + if (ps->performance_levels[i].mclk > max_mclk_vddci) + ps->performance_levels[i].mclk = max_mclk_vddci; + } + if (max_mclk_vddc) { + if (ps->performance_levels[i].mclk > max_mclk_vddc) + ps->performance_levels[i].mclk = max_mclk_vddc; + } + } + /* XXX validate the min clocks required for display */ if (disable_mclk_switching) { -- cgit v1.2.3 From 813d32f91333e4c33d5a19b67167c4bae42dae75 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 14 Oct 2014 02:35:49 -0400 Subject: ext4: check s_chksum_driver when looking for bg csum presence Convert the ext4_has_group_desc_csum predicate to look for a checksum driver instead of the metadata_csum flag and change the bg checksum calculation function to look for GDT_CSUM before taking the crc16 path. Without this patch, if we mount with ^uninit_bg,^metadata_csum and later metadata_csum gets turned on by accident, the block group checksum functions will incorrectly assume that checksumming is enabled (metadata_csum) but that crc16 should be used (!s_chksum_driver). This is totally wrong, so fix the predicate and the checksum formula selection. (Granted, if the metadata_csum feature bit gets enabled on a live FS then something underhanded is going on, but we could at least avoid writing garbage into the on-disk fields.) Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Reviewed-by: Dmitry Monakhov Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 4 ++-- fs/ext4/super.c | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 1483d9c6061f..c55a1faaed58 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2333,8 +2333,8 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_group_desc_csum(struct super_block *sb) { return EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM | - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || + (EXT4_SB(sb)->s_chksum_driver != NULL); } static inline int ext4_has_metadata_csum(struct super_block *sb) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5afe42db303c..e96b6ecc8d34 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2005,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, } /* old crc16 code */ + if (!(sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + return 0; + offset = offsetof(struct ext4_group_desc, bg_checksum); crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); -- cgit v1.2.3 From bfe9a2cfe91a1c920f152ce5fd0a9ad74b3daf12 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Oct 2014 19:41:38 +1100 Subject: powerpc: Reimplement __get_SP() as a function not a define Li Zhong points out an issue with our current __get_SP() implementation. If ftrace function tracing is enabled (ie -pg profiling using _mcount) we spill a stack frame on 64bit all the time. If a function calls __get_SP() and later calls a function that is tail call optimised, we will pop the stack frame and the value returned by __get_SP() is no longer valid. An example from Li can be found in save_stack_trace -> save_context_stack: c0000000000432c0 <.save_stack_trace>: c0000000000432c0: mflr r0 c0000000000432c4: std r0,16(r1) c0000000000432c8: stdu r1,-128(r1) <-- stack frame for _mcount c0000000000432cc: std r3,112(r1) c0000000000432d0: bl <._mcount> c0000000000432d4: nop c0000000000432d8: mr r4,r1 <-- __get_SP() c0000000000432dc: ld r5,632(r13) c0000000000432e0: ld r3,112(r1) c0000000000432e4: li r6,1 c0000000000432e8: addi r1,r1,128 <-- pop stack frame c0000000000432ec: ld r0,16(r1) c0000000000432f0: mtlr r0 c0000000000432f4: b <.save_context_stack> <-- tail call optimized save_context_stack ends up with a stack pointer below the current one, and it is likely to be scribbled over. Fix this by making __get_SP() a function which returns the callers stack frame. Also replace inline assembly which grabs the stack pointer in save_stack_trace and show_stack with __get_SP(). This also fixes an issue with perf_arch_fetch_caller_regs(). It currently unwinds the stack once, which will skip a valid stack frame on a leaf function. With the __get_SP() fixes in this patch, we never need to unwind the stack frame to get to the first interesting frame. We have to export __get_SP() because perf_arch_fetch_caller_regs() (which is used in modules) calls it from a header file. Reported-by: Li Zhong Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/perf_event.h | 2 +- arch/powerpc/include/asm/reg.h | 3 +-- arch/powerpc/kernel/misc.S | 4 ++++ arch/powerpc/kernel/ppc_ksyms.c | 2 ++ arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- 6 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index 0bb23725b1e7..b058568a850d 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -34,7 +34,7 @@ do { \ (regs)->result = 0; \ (regs)->nip = __ip; \ - (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + (regs)->gpr[1] = __get_SP(); \ asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ } while (0) #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index fe3f9488f321..e539d7e71451 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1265,8 +1265,7 @@ static inline unsigned long mfvtb (void) #define proc_trap() asm volatile("trap") -#define __get_SP() ({unsigned long sp; \ - asm volatile("mr %0,1": "=r" (sp)); sp;}) +extern unsigned long __get_SP(void); extern unsigned long scom970_read(unsigned int address); extern void scom970_write(unsigned int address, unsigned long value); diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 7ce26d45777e..120deb713bc8 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -114,3 +114,7 @@ _GLOBAL(longjmp) mtlr r0 mr r3,r4 blr + +_GLOBAL(__get_SP) + PPC_LL r3,0(r1) + blr diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index c4dfff6c2719..9d84efbd7b7a 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -41,3 +41,5 @@ EXPORT_SYMBOL(giveup_spe); #ifdef CONFIG_EPAPR_PARAVIRT EXPORT_SYMBOL(epapr_hypercall_start); #endif + +EXPORT_SYMBOL(__get_SP); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index aa1df89c8b2a..3cc643988101 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1545,7 +1545,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) tsk = current; if (sp == 0) { if (tsk == current) - asm("mr %0,1" : "=r" (sp)); + sp = __get_SP(); else sp = tsk->thread.ksp; } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 3d30ef1038e5..7f65baec29f6 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -50,7 +50,7 @@ void save_stack_trace(struct stack_trace *trace) { unsigned long sp; - asm("mr %0,1" : "=r" (sp)); + sp = __get_SP(); save_context_stack(trace, sp, current, 1); } -- cgit v1.2.3 From acf620ecf56cfc4edaffaf158250e128539cdd26 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Oct 2014 19:41:39 +1100 Subject: powerpc: Rename __get_SP() to current_stack_pointer() Michael points out that __get_SP() is a pretty horrible function name. Let's give it a better name. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/perf_event.h | 2 +- arch/powerpc/include/asm/reg.h | 2 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/misc.S | 2 +- arch/powerpc/kernel/ppc_ksyms.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/stacktrace.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h index b058568a850d..8bf1b6351716 100644 --- a/arch/powerpc/include/asm/perf_event.h +++ b/arch/powerpc/include/asm/perf_event.h @@ -34,7 +34,7 @@ do { \ (regs)->result = 0; \ (regs)->nip = __ip; \ - (regs)->gpr[1] = __get_SP(); \ + (regs)->gpr[1] = current_stack_pointer(); \ asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ } while (0) #endif diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e539d7e71451..c998279bd85b 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1265,7 +1265,7 @@ static inline unsigned long mfvtb (void) #define proc_trap() asm volatile("trap") -extern unsigned long __get_SP(void); +extern unsigned long current_stack_pointer(void); extern unsigned long scom970_read(unsigned int address); extern void scom970_write(unsigned int address, unsigned long value); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8eb857f216c1..c14383575fe8 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -466,7 +466,7 @@ static inline void check_stack_overflow(void) #ifdef CONFIG_DEBUG_STACKOVERFLOW long sp; - sp = __get_SP() & (THREAD_SIZE-1); + sp = current_stack_pointer() & (THREAD_SIZE-1); /* check for stack overflow: is there less than 2KB free? */ if (unlikely(sp < (sizeof(struct thread_info) + 2048))) { diff --git a/arch/powerpc/kernel/misc.S b/arch/powerpc/kernel/misc.S index 120deb713bc8..0d432194c018 100644 --- a/arch/powerpc/kernel/misc.S +++ b/arch/powerpc/kernel/misc.S @@ -115,6 +115,6 @@ _GLOBAL(longjmp) mr r3,r4 blr -_GLOBAL(__get_SP) +_GLOBAL(current_stack_pointer) PPC_LL r3,0(r1) blr diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c index 9d84efbd7b7a..202963ee013a 100644 --- a/arch/powerpc/kernel/ppc_ksyms.c +++ b/arch/powerpc/kernel/ppc_ksyms.c @@ -42,4 +42,4 @@ EXPORT_SYMBOL(giveup_spe); EXPORT_SYMBOL(epapr_hypercall_start); #endif -EXPORT_SYMBOL(__get_SP); +EXPORT_SYMBOL(current_stack_pointer); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 3cc643988101..923cd2daba89 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1545,7 +1545,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack) tsk = current; if (sp == 0) { if (tsk == current) - sp = __get_SP(); + sp = current_stack_pointer(); else sp = tsk->thread.ksp; } diff --git a/arch/powerpc/kernel/stacktrace.c b/arch/powerpc/kernel/stacktrace.c index 7f65baec29f6..ea43a347a104 100644 --- a/arch/powerpc/kernel/stacktrace.c +++ b/arch/powerpc/kernel/stacktrace.c @@ -50,7 +50,7 @@ void save_stack_trace(struct stack_trace *trace) { unsigned long sp; - sp = __get_SP(); + sp = current_stack_pointer(); save_context_stack(trace, sp, current, 1); } -- cgit v1.2.3 From 4ff52b4dedcd4226cf1f2817eb14e8d733790eae Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Oct 2014 19:41:40 +1100 Subject: powerpc/pseries: Use dump_stack instead of show_stack We can use the simpler dump_stack() instead of show_stack(current, __get_SP()) Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/iommu.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index de1ec54a2a57..e32e00976a94 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -30,7 +30,6 @@ #include #include #include -#include /* for show_stack */ #include #include #include @@ -168,7 +167,7 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); printk("\ttce val = 0x%llx\n", tce ); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } tcenum++; @@ -257,7 +256,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\tnpages = 0x%llx\n", (u64)npages); printk("\ttce[0] val = 0x%llx\n", tcep[0]); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } return ret; } @@ -273,7 +272,7 @@ static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } tcenum++; @@ -292,7 +291,7 @@ static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long n printk("\trc = %lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\tnpages = 0x%llx\n", (u64)npages); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } } @@ -307,7 +306,7 @@ static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); printk("\tindex = 0x%llx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%llx\n", (u64)tcenum); - show_stack(current, (unsigned long *)__get_SP()); + dump_stack(); } return tce_ret; -- cgit v1.2.3 From d6f1e7abdb95a7ea031e7604829e4b5514d7e2c1 Mon Sep 17 00:00:00 2001 From: Bharata B Rao Date: Tue, 16 Sep 2014 15:15:45 -0500 Subject: powerpc/pseries: Make CPU hotplug path endian safe - ibm,rtas-configure-connector should treat the RTAS data as big endian. - Treat ibm,ppc-interrupt-server#s as big-endian when setting smp_processor_id during hotplug. Signed-off-by: Bharata B Rao Signed-off-by: Thomas Falcon Acked-by: Nathan Fontenot Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/pseries/dlpar.c | 22 +++++++++++----------- arch/powerpc/platforms/pseries/hotplug-cpu.c | 4 ++-- arch/powerpc/platforms/pseries/pseries.h | 3 ++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index fdf01b660d59..6ad83bd11fe2 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -25,11 +25,11 @@ #include struct cc_workarea { - u32 drc_index; - u32 zero; - u32 name_offset; - u32 prop_length; - u32 prop_offset; + __be32 drc_index; + __be32 zero; + __be32 name_offset; + __be32 prop_length; + __be32 prop_offset; }; void dlpar_free_cc_property(struct property *prop) @@ -49,11 +49,11 @@ static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) if (!prop) return NULL; - name = (char *)ccwa + ccwa->name_offset; + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset); prop->name = kstrdup(name, GFP_KERNEL); - prop->length = ccwa->prop_length; - value = (char *)ccwa + ccwa->prop_offset; + prop->length = be32_to_cpu(ccwa->prop_length); + value = (char *)ccwa + be32_to_cpu(ccwa->prop_offset); prop->value = kmemdup(value, prop->length, GFP_KERNEL); if (!prop->value) { dlpar_free_cc_property(prop); @@ -79,7 +79,7 @@ static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa, if (!dn) return NULL; - name = (char *)ccwa + ccwa->name_offset; + name = (char *)ccwa + be32_to_cpu(ccwa->name_offset); dn->full_name = kasprintf(GFP_KERNEL, "%s/%s", path, name); if (!dn->full_name) { kfree(dn); @@ -126,7 +126,7 @@ void dlpar_free_cc_nodes(struct device_node *dn) #define CALL_AGAIN -2 #define ERR_CFG_USE -9003 -struct device_node *dlpar_configure_connector(u32 drc_index, +struct device_node *dlpar_configure_connector(__be32 drc_index, struct device_node *parent) { struct device_node *dn; @@ -414,7 +414,7 @@ static ssize_t dlpar_cpu_probe(const char *buf, size_t count) if (!parent) return -ENODEV; - dn = dlpar_configure_connector(drc_index, parent); + dn = dlpar_configure_connector(cpu_to_be32(drc_index), parent); if (!dn) return -EINVAL; diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c index b174fa751d26..5c375f93c669 100644 --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -247,7 +247,7 @@ static int pseries_add_processor(struct device_node *np) unsigned int cpu; cpumask_var_t candidate_mask, tmp; int err = -ENOSPC, len, nthreads, i; - const u32 *intserv; + const __be32 *intserv; intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); if (!intserv) @@ -293,7 +293,7 @@ static int pseries_add_processor(struct device_node *np) for_each_cpu(cpu, tmp) { BUG_ON(cpu_present(cpu)); set_cpu_present(cpu, true); - set_hard_smp_processor_id(cpu, *intserv++); + set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++)); } err = 0; out_unlock: diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h index 361add62abf1..1796c5438cc6 100644 --- a/arch/powerpc/platforms/pseries/pseries.h +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -56,7 +56,8 @@ extern void hvc_vio_init_early(void); /* Dynamic logical Partitioning/Mobility */ extern void dlpar_free_cc_nodes(struct device_node *); extern void dlpar_free_cc_property(struct property *); -extern struct device_node *dlpar_configure_connector(u32, struct device_node *); +extern struct device_node *dlpar_configure_connector(__be32, + struct device_node *); extern int dlpar_attach_node(struct device_node *); extern int dlpar_detach_node(struct device_node *); -- cgit v1.2.3 From 8315070c07e7ef5f58ce9e317dc91fd727ecd419 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:49 +1000 Subject: powerpc/eeh: Fix condition for isolated state Function eeh_pe_state_mark() could possibly have combination of multiple EEH PE state as its argument. The patch fixes the condition used to check if EEH_PE_ISOLATED is included. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh_pe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 53dd0915e690..37f21284809c 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -525,7 +525,7 @@ static void *__eeh_pe_state_mark(void *data, void *flag) pe->state |= state; /* Offline PCI devices if applicable */ - if (state != EEH_PE_ISOLATED) + if (!(state & EEH_PE_ISOLATED)) return NULL; eeh_pe_for_each_dev(pe, edev, tmp) { -- cgit v1.2.3 From 8a6b3710ccc33da1fd5c85144ad3db01c4457552 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:50 +1000 Subject: powerpc/eeh: Rename flag EEH_PE_RESET to EEH_PE_CFG_BLOCKED The flag EEH_PE_RESET indicates blocking config space of the PE during reset time. We potentially need block PE's config space other than reset time. So it's reasonable to replace it with EEH_PE_CFG_BLOCKED to indicate its usage. There are no substantial code or logic changes in this patch. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 2 +- arch/powerpc/kernel/eeh.c | 12 ++++++------ arch/powerpc/kernel/eeh_driver.c | 12 ++++++------ arch/powerpc/kernel/rtas_pci.c | 4 ++-- arch/powerpc/platforms/powernv/eeh-ioda.c | 2 +- arch/powerpc/platforms/powernv/pci.c | 2 +- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 3b260efbfbf9..6a2ad90e6d8c 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -71,7 +71,7 @@ struct device_node; #define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */ #define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */ -#define EEH_PE_RESET (1 << 2) /* PE reset in progress */ +#define EEH_PE_CFG_BLOCKED (1 << 2) /* Block config access */ #define EEH_PE_KEEP (1 << 8) /* Keep PE on hotplug */ diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index d543e4179c18..4d83f50cc614 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -673,18 +673,18 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat switch (state) { case pcie_deassert_reset: eeh_ops->reset(pe, EEH_RESET_DEACTIVATE); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); break; case pcie_hot_reset: - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); eeh_ops->reset(pe, EEH_RESET_HOT); break; case pcie_warm_reset: - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL); break; default: - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); return -EINVAL; }; @@ -1523,7 +1523,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) switch (option) { case EEH_RESET_DEACTIVATE: ret = eeh_ops->reset(pe, option); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); if (ret) break; @@ -1538,7 +1538,7 @@ int eeh_pe_reset(struct eeh_pe *pe, int option) */ eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE); - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); ret = eeh_ops->reset(pe, option); break; default: diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 3fd514f8e4b2..6535936bdf27 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -528,13 +528,13 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe) eeh_pe_dev_traverse(pe, eeh_report_error, &result); /* Issue reset */ - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); ret = eeh_reset_pe(pe); if (ret) { - eeh_pe_state_clear(pe, EEH_PE_RECOVERING | EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_RECOVERING | EEH_PE_CFG_BLOCKED); return ret; } - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); /* Unfreeze the PE */ ret = eeh_clear_pe_frozen_state(pe, true); @@ -601,10 +601,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ - eeh_pe_state_mark(pe, EEH_PE_RESET); + eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED); rc = eeh_reset_pe(pe); if (rc) { - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); return rc; } @@ -613,7 +613,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) /* Restore PE */ eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); - eeh_pe_state_clear(pe, EEH_PE_RESET); + eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED); /* Clear frozen state */ rc = eeh_clear_pe_frozen_state(pe, false); diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index c168337aef9d..ce7c8b6a8602 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -111,7 +111,7 @@ static int rtas_pci_read_config(struct pci_bus *bus, return PCIBIOS_DEVICE_NOT_FOUND; #ifdef CONFIG_EEH edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && edev->pe->state & EEH_PE_RESET) + if (edev && edev->pe && edev->pe->state & EEH_PE_CFG_BLOCKED) return PCIBIOS_DEVICE_NOT_FOUND; #endif @@ -175,7 +175,7 @@ static int rtas_pci_write_config(struct pci_bus *bus, return PCIBIOS_DEVICE_NOT_FOUND; #ifdef CONFIG_EEH edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && (edev->pe->state & EEH_PE_RESET)) + if (edev && edev->pe && (edev->pe->state & EEH_PE_CFG_BLOCKED)) return PCIBIOS_DEVICE_NOT_FOUND; #endif ret = rtas_write_config(pdn, where, size, val); diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index 426814a2ede3..eba9cb10619c 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -373,7 +373,7 @@ static int ioda_eeh_get_pe_state(struct eeh_pe *pe) * moving forward, we have to return operational * state during PE reset. */ - if (pe->state & EEH_PE_RESET) { + if (pe->state & EEH_PE_CFG_BLOCKED) { result = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE | EEH_STATE_MMIO_ENABLED | diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index e9f509bbc078..b1b7ac2d494f 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -513,7 +513,7 @@ static bool pnv_pci_cfg_check(struct pci_controller *hose, edev = of_node_to_eeh_dev(dn); if (edev) { if (edev->pe && - (edev->pe->state & EEH_PE_RESET)) + (edev->pe->state & EEH_PE_CFG_BLOCKED)) return false; if (edev->mode & EEH_DEV_REMOVED) -- cgit v1.2.3 From d2cfbcd7c8136bc4d944eb64f7f0479dd2b129b8 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:51 +1000 Subject: powerpc/powernv: Drop config requests in EEH accessors It's bad idea to access the PCI config registers of the adapters, which is experiencing reset. It leads to recursive EEH error without exception. The patch drops PCI config requests in EEH accessors if the PE has been marked to accept PCI config requests, for example during PE reseet time. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 37 ++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 3e89cbf55885..04e42f78a37a 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -383,6 +383,39 @@ static int powernv_eeh_err_inject(struct eeh_pe *pe, int type, int func, return ret; } +static inline bool powernv_eeh_cfg_blocked(struct device_node *dn) +{ + struct eeh_dev *edev = of_node_to_eeh_dev(dn); + + if (!edev || !edev->pe) + return false; + + if (edev->pe->state & EEH_PE_CFG_BLOCKED) + return true; + + return false; +} + +static int powernv_eeh_read_config(struct device_node *dn, + int where, int size, u32 *val) +{ + if (powernv_eeh_cfg_blocked(dn)) { + *val = 0xFFFFFFFF; + return PCIBIOS_SET_FAILED; + } + + return pnv_pci_cfg_read(dn, where, size, val); +} + +static int powernv_eeh_write_config(struct device_node *dn, + int where, int size, u32 val) +{ + if (powernv_eeh_cfg_blocked(dn)) + return PCIBIOS_SET_FAILED; + + return pnv_pci_cfg_write(dn, where, size, val); +} + /** * powernv_eeh_next_error - Retrieve next EEH error to handle * @pe: Affected PE @@ -440,8 +473,8 @@ static struct eeh_ops powernv_eeh_ops = { .get_log = powernv_eeh_get_log, .configure_bridge = powernv_eeh_configure_bridge, .err_inject = powernv_eeh_err_inject, - .read_config = pnv_pci_cfg_read, - .write_config = pnv_pci_cfg_write, + .read_config = powernv_eeh_read_config, + .write_config = powernv_eeh_write_config, .next_error = powernv_eeh_next_error, .restore_config = powernv_eeh_restore_config }; -- cgit v1.2.3 From 3409eb4e69e1150202bc4ec61801115da32aa380 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:52 +1000 Subject: powerpc/pseries: Drop config requests in EEH accessors The pSeires EEH config accessors rely on rtas_{read, write}_config() and the condition to check if the PE's config space is blocked should be moved to those 2 functions so that config requests from kernel, userland, EEH core can be dropped to avoid recursive EEH error if necessary. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/rtas_pci.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/rtas_pci.c b/arch/powerpc/kernel/rtas_pci.c index ce7c8b6a8602..7c55b86206b3 100644 --- a/arch/powerpc/kernel/rtas_pci.c +++ b/arch/powerpc/kernel/rtas_pci.c @@ -66,6 +66,11 @@ int rtas_read_config(struct pci_dn *pdn, int where, int size, u32 *val) return PCIBIOS_DEVICE_NOT_FOUND; if (!config_access_valid(pdn, where)) return PCIBIOS_BAD_REGISTER_NUMBER; +#ifdef CONFIG_EEH + if (pdn->edev && pdn->edev->pe && + (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED)) + return PCIBIOS_SET_FAILED; +#endif addr = rtas_config_addr(pdn->busno, pdn->devfn, where); buid = pdn->phb->buid; @@ -90,9 +95,6 @@ static int rtas_pci_read_config(struct pci_bus *bus, struct device_node *busdn, *dn; struct pci_dn *pdn; bool found = false; -#ifdef CONFIG_EEH - struct eeh_dev *edev; -#endif int ret; /* Search only direct children of the bus */ @@ -109,11 +111,6 @@ static int rtas_pci_read_config(struct pci_bus *bus, if (!found) return PCIBIOS_DEVICE_NOT_FOUND; -#ifdef CONFIG_EEH - edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && edev->pe->state & EEH_PE_CFG_BLOCKED) - return PCIBIOS_DEVICE_NOT_FOUND; -#endif ret = rtas_read_config(pdn, where, size, val); if (*val == EEH_IO_ERROR_VALUE(size) && @@ -132,6 +129,11 @@ int rtas_write_config(struct pci_dn *pdn, int where, int size, u32 val) return PCIBIOS_DEVICE_NOT_FOUND; if (!config_access_valid(pdn, where)) return PCIBIOS_BAD_REGISTER_NUMBER; +#ifdef CONFIG_EEH + if (pdn->edev && pdn->edev->pe && + (pdn->edev->pe->state & EEH_PE_CFG_BLOCKED)) + return PCIBIOS_SET_FAILED; +#endif addr = rtas_config_addr(pdn->busno, pdn->devfn, where); buid = pdn->phb->buid; @@ -155,10 +157,6 @@ static int rtas_pci_write_config(struct pci_bus *bus, struct device_node *busdn, *dn; struct pci_dn *pdn; bool found = false; -#ifdef CONFIG_EEH - struct eeh_dev *edev; -#endif - int ret; /* Search only direct children of the bus */ busdn = pci_bus_to_OF_node(bus); @@ -173,14 +171,8 @@ static int rtas_pci_write_config(struct pci_bus *bus, if (!found) return PCIBIOS_DEVICE_NOT_FOUND; -#ifdef CONFIG_EEH - edev = of_node_to_eeh_dev(dn); - if (edev && edev->pe && (edev->pe->state & EEH_PE_CFG_BLOCKED)) - return PCIBIOS_DEVICE_NOT_FOUND; -#endif - ret = rtas_write_config(pdn, where, size, val); - return ret; + return rtas_write_config(pdn, where, size, val); } static struct pci_ops rtas_pci_ops = { -- cgit v1.2.3 From b6541db1395298b326ae1bf59fae6fbb2c6e8f77 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:53 +1000 Subject: powerpc/eeh: Block PCI config access upon frozen PE The problem was found when I tried to inject PCI config error by PHB3 PAPR error injection registers into Broadcom Austin 4-ports NIC adapter. The frozen PE was reported successfully and EEH core started to recover it. However, I run into fenced PHB when dumping PCI config space as EEH logs. I was told that PCI config requests should not be progagated to the adapter until PE reset is done successfully. Otherise, we would run out of PHB internal credits and trigger PCT (PCIE Completion Timeout), which leads to the fenced PHB. The patch introduces another PE flag EEH_PE_CFG_RESTRICTED, which is set during PE initialization time if the PE includes the specific PCI devices that need block PCI config access until PE reset is done. When the PE becomes frozen for the first time, EEH_PE_CFG_BLOCKED is set if the PE has flag EEH_PE_CFG_RESTRICTED. Then the PCI config access to the PE will be dropped by platform PCI accessors until PE reset is done successfully. The mechanism is shared by PowerNV platform owned PE or userland owned ones. It's not used on pSeries platform yet. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/eeh.h | 1 + arch/powerpc/kernel/eeh_pe.c | 8 ++++++++ arch/powerpc/platforms/powernv/eeh-powernv.c | 19 +++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 6a2ad90e6d8c..ca07f9c27335 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -74,6 +74,7 @@ struct device_node; #define EEH_PE_CFG_BLOCKED (1 << 2) /* Block config access */ #define EEH_PE_KEEP (1 << 8) /* Keep PE on hotplug */ +#define EEH_PE_CFG_RESTRICTED (1 << 9) /* Block config on error */ struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c index 37f21284809c..5a63e2b0f65b 100644 --- a/arch/powerpc/kernel/eeh_pe.c +++ b/arch/powerpc/kernel/eeh_pe.c @@ -534,6 +534,10 @@ static void *__eeh_pe_state_mark(void *data, void *flag) pdev->error_state = pci_channel_io_frozen; } + /* Block PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state |= EEH_PE_CFG_BLOCKED; + return NULL; } @@ -611,6 +615,10 @@ static void *__eeh_pe_state_clear(void *data, void *flag) pdev->error_state = pci_channel_io_normal; } + /* Unblock PCI config access if required */ + if (pe->state & EEH_PE_CFG_RESTRICTED) + pe->state &= ~EEH_PE_CFG_BLOCKED; + return NULL; } diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 04e42f78a37a..443ce965a5b0 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -168,6 +168,25 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) return ret; } + /* + * If the PE contains any one of following adapters, the + * PCI config space can't be accessed when dumping EEH log. + * Otherwise, we will run into fenced PHB caused by shortage + * of outbound credits in the adapter. The PCI config access + * should be blocked until PE reset. MMIO access is dropped + * by hardware certainly. In order to drop PCI config requests, + * one more flag (EEH_PE_CFG_RESTRICTED) is introduced, which + * will be checked in the backend for PE state retrival. If + * the PE becomes frozen for the first time and the flag has + * been set for the PE, we will set EEH_PE_CFG_BLOCKED for + * that PE to block its config space. + * + * Broadcom Austin 4-ports NICs (14e4:1657) + */ + if (dev->vendor == PCI_VENDOR_ID_BROADCOM && + dev->device == 0x1657) + edev->pe->state |= EEH_PE_CFG_RESTRICTED; + /* * Cache the PE primary bus, which can't be fetched when * full hotplug is in progress. In that case, all child -- cgit v1.2.3 From c59004cc83c3f8b182c32ca9d366d222a59ab63f Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Wed, 1 Oct 2014 17:07:54 +1000 Subject: powerpc/eeh: Don't collect logs on PE with blocked config space When the PE's config space is marked as blocked, PCI config read requests always return 0xFF's. It's pointless to collect logs in this case. Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/eeh.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 4d83f50cc614..2248a1999c64 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -257,6 +257,13 @@ static void *eeh_dump_pe_log(void *data, void *flag) struct eeh_dev *edev, *tmp; size_t *plen = flag; + /* If the PE's config space is blocked, 0xFF's will be + * returned. It's pointless to collect the log in this + * case. + */ + if (pe->state & EEH_PE_CFG_BLOCKED) + return NULL; + eeh_pe_for_each_dev(pe, edev, tmp) *plen += eeh_dump_dev_log(edev, pci_regs_buf + *plen, EEH_PCI_REGS_LOG_LEN - *plen); -- cgit v1.2.3 From 179ea48bc7c04dba3526d66d9f358c2f4f3b3776 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 3 Oct 2014 14:58:32 +1000 Subject: powerpc/eeh: Block CFG upon frozen Shiner adapter The Broadcom Shiner 2-ports 10G ethernet adapter has same problem commit 6f20bda0 ("powerpc/eeh: Block PCI config access upon frozen PE") fixes. Put it to the black list as well. # lspci -s 0004:01:00.0 0004:01:00.0 Ethernet controller: Broadcom Corporation \ NetXtreme II BCM57810 10 Gigabit Ethernet (rev 10) # lspci -n -s 0004:01:00.0 0004:01:00.0 0200: 14e4:168e (rev 10) Reported-by: John Walthour Signed-off-by: Gavin Shan Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/eeh-powernv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c index 443ce965a5b0..1d19e7917d7f 100644 --- a/arch/powerpc/platforms/powernv/eeh-powernv.c +++ b/arch/powerpc/platforms/powernv/eeh-powernv.c @@ -182,9 +182,10 @@ static int powernv_eeh_dev_probe(struct pci_dev *dev, void *flag) * that PE to block its config space. * * Broadcom Austin 4-ports NICs (14e4:1657) + * Broadcom Shiner 2-ports 10G NICs (14e4:168e) */ - if (dev->vendor == PCI_VENDOR_ID_BROADCOM && - dev->device == 0x1657) + if ((dev->vendor == PCI_VENDOR_ID_BROADCOM && dev->device == 0x1657) || + (dev->vendor == PCI_VENDOR_ID_BROADCOM && dev->device == 0x168e)) edev->pe->state |= EEH_PE_CFG_RESTRICTED; /* -- cgit v1.2.3 From 695911fb1f0e00aebe6c5636b9c08bf0fd51a2fd Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 10 Oct 2014 19:04:24 +1100 Subject: powerpc/msi: Fix the msi bitmap alignment tests When we added the alignment tests recently we failed to check they were actually passing - oops. They weren't passing, because the bitmap was full. We should also be a bit more careful when checking the return code, a negative error return could by divisible by our alignment value. Fixes: b0345bbc6d09 ("powerpc/msi: Improve IRQ bitmap allocator") Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/msi_bitmap.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 0c75214b6f92..8155d93dee1d 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -151,7 +151,7 @@ void msi_bitmap_free(struct msi_bitmap *bmp) static void __init test_basics(void) { struct msi_bitmap bmp; - int i, size = 512; + int rc, i, size = 512; /* Can't allocate a bitmap of 0 irqs */ check(msi_bitmap_alloc(&bmp, 0, NULL) != 0); @@ -185,14 +185,24 @@ static void __init test_basics(void) msi_bitmap_free_hwirqs(&bmp, size / 2, 1); check(msi_bitmap_alloc_hwirqs(&bmp, 1) == size / 2); + /* Free most of them for the alignment tests */ + msi_bitmap_free_hwirqs(&bmp, 3, size - 3); + /* Check we get a naturally aligned offset */ - check(msi_bitmap_alloc_hwirqs(&bmp, 2) % 2 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 4) % 4 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 8) % 8 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 9) % 16 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 3) % 4 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 7) % 8 == 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 121) % 128 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 2); + check(rc >= 0 && rc % 2 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 4); + check(rc >= 0 && rc % 4 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 8); + check(rc >= 0 && rc % 8 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 9); + check(rc >= 0 && rc % 16 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 3); + check(rc >= 0 && rc % 4 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 7); + check(rc >= 0 && rc % 8 == 0); + rc = msi_bitmap_alloc_hwirqs(&bmp, 121); + check(rc >= 0 && rc % 128 == 0); msi_bitmap_free(&bmp); -- cgit v1.2.3 From 4a77f2bdbdef289a02bd02fac483a9350e039705 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 10 Oct 2014 19:04:25 +1100 Subject: powerpc/msi: Use WARN_ON() in msi bitmap selftests As demonstrated in the previous commit, the failure message from the msi bitmap selftests is a bit subtle, it's easy to miss a failure in a busy boot log. So drop our check() macro and use WARN_ON() instead. This necessitates inverting all the conditions as well. Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/msi_bitmap.c | 54 ++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/sysdev/msi_bitmap.c b/arch/powerpc/sysdev/msi_bitmap.c index 8155d93dee1d..73b64c73505b 100644 --- a/arch/powerpc/sysdev/msi_bitmap.c +++ b/arch/powerpc/sysdev/msi_bitmap.c @@ -145,69 +145,64 @@ void msi_bitmap_free(struct msi_bitmap *bmp) #ifdef CONFIG_MSI_BITMAP_SELFTEST -#define check(x) \ - if (!(x)) printk("msi_bitmap: test failed at line %d\n", __LINE__); - static void __init test_basics(void) { struct msi_bitmap bmp; int rc, i, size = 512; /* Can't allocate a bitmap of 0 irqs */ - check(msi_bitmap_alloc(&bmp, 0, NULL) != 0); + WARN_ON(msi_bitmap_alloc(&bmp, 0, NULL) == 0); /* of_node may be NULL */ - check(0 == msi_bitmap_alloc(&bmp, size, NULL)); + WARN_ON(msi_bitmap_alloc(&bmp, size, NULL)); /* Should all be free by default */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* With no node, there's no msi-available-ranges, so expect > 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) > 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp) <= 0); /* Should all still be free */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* Check we can fill it up and then no more */ for (i = 0; i < size; i++) - check(msi_bitmap_alloc_hwirqs(&bmp, 1) >= 0); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) < 0); - check(msi_bitmap_alloc_hwirqs(&bmp, 1) < 0); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) >= 0); /* Should all be allocated */ - check(bitmap_find_free_region(bmp.bitmap, size, 0) < 0); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, 0) >= 0); /* And if we free one we can then allocate another */ msi_bitmap_free_hwirqs(&bmp, size / 2, 1); - check(msi_bitmap_alloc_hwirqs(&bmp, 1) == size / 2); + WARN_ON(msi_bitmap_alloc_hwirqs(&bmp, 1) != size / 2); /* Free most of them for the alignment tests */ msi_bitmap_free_hwirqs(&bmp, 3, size - 3); /* Check we get a naturally aligned offset */ rc = msi_bitmap_alloc_hwirqs(&bmp, 2); - check(rc >= 0 && rc % 2 == 0); + WARN_ON(rc < 0 && rc % 2 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 4); - check(rc >= 0 && rc % 4 == 0); + WARN_ON(rc < 0 && rc % 4 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 8); - check(rc >= 0 && rc % 8 == 0); + WARN_ON(rc < 0 && rc % 8 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 9); - check(rc >= 0 && rc % 16 == 0); + WARN_ON(rc < 0 && rc % 16 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 3); - check(rc >= 0 && rc % 4 == 0); + WARN_ON(rc < 0 && rc % 4 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 7); - check(rc >= 0 && rc % 8 == 0); + WARN_ON(rc < 0 && rc % 8 != 0); rc = msi_bitmap_alloc_hwirqs(&bmp, 121); - check(rc >= 0 && rc % 128 == 0); + WARN_ON(rc < 0 && rc % 128 != 0); msi_bitmap_free(&bmp); - /* Clients may check bitmap == NULL for "not-allocated" */ - check(bmp.bitmap == NULL); + /* Clients may WARN_ON bitmap == NULL for "not-allocated" */ + WARN_ON(bmp.bitmap != NULL); kfree(bmp.bitmap); } @@ -229,14 +224,13 @@ static void __init test_of_node(void) of_node_init(&of_node); of_node.full_name = node_name; - check(0 == msi_bitmap_alloc(&bmp, size, &of_node)); + WARN_ON(msi_bitmap_alloc(&bmp, size, &of_node)); /* No msi-available-ranges, so expect > 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) > 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp) <= 0); /* Should all still be free */ - check(0 == bitmap_find_free_region(bmp.bitmap, size, - get_count_order(size))); + WARN_ON(bitmap_find_free_region(bmp.bitmap, size, get_count_order(size))); bitmap_release_region(bmp.bitmap, 0, get_count_order(size)); /* Now create a fake msi-available-ranges property */ @@ -250,11 +244,11 @@ static void __init test_of_node(void) of_node.properties = ∝ /* msi-available-ranges, so expect == 0 */ - check(msi_bitmap_reserve_dt_hwirqs(&bmp) == 0); + WARN_ON(msi_bitmap_reserve_dt_hwirqs(&bmp)); /* Check we got the expected result */ - check(0 == bitmap_parselist(expected_str, expected, size)); - check(bitmap_equal(expected, bmp.bitmap, size)); + WARN_ON(bitmap_parselist(expected_str, expected, size)); + WARN_ON(!bitmap_equal(expected, bmp.bitmap, size)); msi_bitmap_free(&bmp); kfree(bmp.bitmap); -- cgit v1.2.3 From c9eeb7b813c9525cda34b61dcf4455c52fc58890 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Tue, 14 Oct 2014 17:04:48 +0200 Subject: s390/hmcdrv: Restrict s390 HMC driver to S390 arch This driver is only usable on 64-bit s390 machines. Mark the Kconfig dependencies to that users on other architectures are not prompted for it. Fixes: 8f933b1043e1e5 ("s390/hmcdrv: HMC drive CD/DVD access") Signed-off-by: Josh Boyer Signed-off-by: Martin Schwidefsky --- drivers/s390/char/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/s390/char/Kconfig b/drivers/s390/char/Kconfig index dc24ecfac2d1..db2cb1f8a1b5 100644 --- a/drivers/s390/char/Kconfig +++ b/drivers/s390/char/Kconfig @@ -105,7 +105,7 @@ config SCLP_ASYNC config HMC_DRV def_tristate m prompt "Support for file transfers from HMC drive CD/DVD-ROM" - depends on 64BIT + depends on S390 && 64BIT select CRC16 help This option enables support for file transfers from a Hardware -- cgit v1.2.3 From 66e9bbdb3dbb335b158bb88de2642966af816ffe Mon Sep 17 00:00:00 2001 From: Dominik Dingel Date: Mon, 6 Oct 2014 16:34:44 +0200 Subject: s390/mm: fixing calls of pte_unmap_unlock pte_unmap works on page table entry pointers, derefencing should be avoided. As on s390 pte_unmap is a NOP, this is more a cleanup if we want to supply later such function. Signed-off-by: Dominik Dingel Reviewed-by: Thomas Huth Signed-off-by: Martin Schwidefsky --- arch/s390/mm/pgtable.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 296b61a4af59..1b79ca67392f 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -656,7 +656,7 @@ void __gmap_zap(struct gmap *gmap, unsigned long gaddr) } pgste_set_unlock(ptep, pgste); out_pte: - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); } EXPORT_SYMBOL_GPL(__gmap_zap); @@ -943,7 +943,7 @@ retry: } if (!(pte_val(*ptep) & _PAGE_INVALID) && (pte_val(*ptep) & _PAGE_PROTECT)) { - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) { up_read(&mm->mmap_sem); return -EFAULT; @@ -974,7 +974,7 @@ retry: pgste_val(new) |= PGSTE_UC_BIT; pgste_set_unlock(ptep, new); - pte_unmap_unlock(*ptep, ptl); + pte_unmap_unlock(ptep, ptl); up_read(&mm->mmap_sem); return 0; } -- cgit v1.2.3 From 5c9fb1899400096c6818181c525897a31d57e488 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Wed, 15 Oct 2014 12:42:58 +0200 Subject: powerpc/vphn: NUMA node code expects big-endian The associativity domain numbers are obtained from the hypervisor through registers and written into memory by the guest: the packed array passed to vphn_unpack_associativity() is then native-endian, unlike what was assumed in the following commit: commit b08a2a12e44eaec5024b2b969f4fcb98169d1ca3 Author: Alistair Popple Date: Wed Aug 7 02:01:44 2013 +1000 powerpc: Make NUMA device node code endian safe This issue fills the topology with bogus data and makes it unusable. It may lead to severe performance breakdowns. We should ideally patch the vphn_unpack_associativity() function to do the 64-bit loads, but this requires some more brain storming. In the meantime, let's go for a suboptimal and temporary bug fix: this patch converts each 64-bit value of the packed array to big endian, as expected by the current parsing code in vphn_unpack_associativity(). Signed-off-by: Greg Kurz Signed-off-by: Michael Ellerman --- arch/powerpc/mm/numa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 177659050fa0..e5236c24dc07 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1460,8 +1460,11 @@ static long hcall_vphn(unsigned long cpu, __be32 *associativity) long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; u64 flags = 1; int hwcpu = get_hard_smp_processor_id(cpu); + int i; rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + for (i = 0; i < 6; i++) + retbuf[i] = cpu_to_be64(retbuf[i]); vphn_unpack_associativity(retbuf, associativity); return rc; -- cgit v1.2.3 From 2c186e05a5c6dc8fcfb1e8bf6901ad1598c40db6 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Oct 2014 20:21:22 +1100 Subject: powerpc: Add printk levels to setup_system output Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup_64.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index cd07d79ad21c..4f3cfe1b6a33 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -522,36 +522,36 @@ void __init setup_system(void) smp_release_cpus(); #endif - printk("Starting Linux PPC64 %s\n", init_utsname()->version); + pr_info("Starting Linux PPC64 %s\n", init_utsname()->version); - printk("-----------------------------------------------------\n"); - printk("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); - printk("phys_mem_size = 0x%llx\n", memblock_phys_mem_size()); + pr_info("-----------------------------------------------------\n"); + pr_info("ppc64_pft_size = 0x%llx\n", ppc64_pft_size); + pr_info("phys_mem_size = 0x%llx\n", memblock_phys_mem_size()); if (ppc64_caches.dline_size != 0x80) - printk("dcache_line_size = 0x%x\n", ppc64_caches.dline_size); + pr_info("dcache_line_size = 0x%x\n", ppc64_caches.dline_size); if (ppc64_caches.iline_size != 0x80) - printk("icache_line_size = 0x%x\n", ppc64_caches.iline_size); + pr_info("icache_line_size = 0x%x\n", ppc64_caches.iline_size); - printk("cpu_features = 0x%016lx\n", cur_cpu_spec->cpu_features); - printk(" possible = 0x%016lx\n", CPU_FTRS_POSSIBLE); - printk(" always = 0x%016lx\n", CPU_FTRS_ALWAYS); - printk("cpu_user_features = 0x%08x 0x%08x\n", cur_cpu_spec->cpu_user_features, + pr_info("cpu_features = 0x%016lx\n", cur_cpu_spec->cpu_features); + pr_info(" possible = 0x%016lx\n", CPU_FTRS_POSSIBLE); + pr_info(" always = 0x%016lx\n", CPU_FTRS_ALWAYS); + pr_info("cpu_user_features = 0x%08x 0x%08x\n", cur_cpu_spec->cpu_user_features, cur_cpu_spec->cpu_user_features2); - printk("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); - printk("firmware_features = 0x%016lx\n", powerpc_firmware_features); + pr_info("mmu_features = 0x%08x\n", cur_cpu_spec->mmu_features); + pr_info("firmware_features = 0x%016lx\n", powerpc_firmware_features); #ifdef CONFIG_PPC_STD_MMU_64 if (htab_address) - printk("htab_address = 0x%p\n", htab_address); + pr_info("htab_address = 0x%p\n", htab_address); - printk("htab_hash_mask = 0x%lx\n", htab_hash_mask); + pr_info("htab_hash_mask = 0x%lx\n", htab_hash_mask); #endif if (PHYSICAL_START > 0) - printk("physical_start = 0x%llx\n", + pr_info("physical_start = 0x%llx\n", (unsigned long long)PHYSICAL_START); - printk("-----------------------------------------------------\n"); + pr_info("-----------------------------------------------------\n"); DBG(" <- setup_system()\n"); } -- cgit v1.2.3 From 86be175a730bd98de2b75522eae08160ec2dec91 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Mon, 13 Oct 2014 20:17:33 +1100 Subject: powerpc: sync pseries_le_defconfig with pseries_defconfig Now KVM is working on LE, enable it. Also enable transarent hugepage which has already been enabled on BE. Signed-off-by: Anton Blanchard Signed-off-by: Michael Ellerman --- arch/powerpc/configs/pseries_le_defconfig | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/configs/pseries_le_defconfig b/arch/powerpc/configs/pseries_le_defconfig index 4428ee428f4e..96a230cd5ca7 100644 --- a/arch/powerpc/configs/pseries_le_defconfig +++ b/arch/powerpc/configs/pseries_le_defconfig @@ -48,7 +48,6 @@ CONFIG_KEXEC=y CONFIG_IRQ_ALL_CPUS=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y -CONFIG_CMA=y CONFIG_PPC_64K_PAGES=y CONFIG_PPC_SUBPAGE_PROT=y CONFIG_SCHED_SMT=y @@ -137,6 +136,7 @@ CONFIG_NETCONSOLE=y CONFIG_NETPOLL_TRAP=y CONFIG_TUN=m CONFIG_VIRTIO_NET=m +CONFIG_VHOST_NET=m CONFIG_VORTEX=y CONFIG_ACENIC=m CONFIG_ACENIC_OMIT_TIGON_I=y @@ -302,4 +302,9 @@ CONFIG_CRYPTO_LZO=m # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_DEV_NX=y CONFIG_CRYPTO_DEV_NX_ENCRYPT=m +CONFIG_VIRTUALIZATION=y +CONFIG_KVM_BOOK3S_64=m +CONFIG_KVM_BOOK3S_64_HV=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y -- cgit v1.2.3 From e17ac6db2ef99388b750b2141c11974dc5742913 Mon Sep 17 00:00:00 2001 From: Ville Syrjälä Date: Thu, 9 Oct 2014 19:37:15 +0300 Subject: drm/i915: Don't trust the DP_DETECT bit for eDP ports on CHV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On CHV the display DDC pins may be muxed to an alternate function if there's no need for DDC on a specific port, which is the case for eDP ports since there's no way to plug in a DP++ HDMI dongle. This causes problems when trying to determine if the port is present since the the DP_DETECTED bit is the latched state of the DDC SDA pin at boot. If the DDC pins are muxed to an alternate function the bit may indicate that the port isn't present. To work around this look at the VBT as well as the DP_DETECTED bit to determine if we should attempt registering an eDP port. Do this only for ports B and C since port D doesn't support eDP (no PPS/BLC). In theory someone could also wire up a normal DP port w/o DDC lines. That would just mean that simple DP++ HDMI dongles wouldn't work on such a port. With this change we would still fail to register such DP ports. But let's hope no one wires their board in such a way, and if they do we can extend the VBT checks to cover normal DP ports as well. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=84265 Signed-off-by: Ville Syrjälä Reviewed-by: Damien Lespiau Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/intel_display.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c index 6a0cc367934d..c9e220963a78 100644 --- a/drivers/gpu/drm/i915/intel_display.c +++ b/drivers/gpu/drm/i915/intel_display.c @@ -12354,27 +12354,36 @@ static void intel_setup_outputs(struct drm_device *dev) if (I915_READ(PCH_DP_D) & DP_DETECTED) intel_dp_init(dev, PCH_DP_D, PORT_D); } else if (IS_VALLEYVIEW(dev)) { - if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIB) & SDVO_DETECTED) { + /* + * The DP_DETECTED bit is the latched state of the DDC + * SDA pin at boot. However since eDP doesn't require DDC + * (no way to plug in a DP->HDMI dongle) the DDC pins for + * eDP ports may have been muxed to an alternate function. + * Thus we can't rely on the DP_DETECTED bit alone to detect + * eDP ports. Consult the VBT as well as DP_DETECTED to + * detect eDP ports. + */ + if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIB) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + GEN4_HDMIB, PORT_B); - if (I915_READ(VLV_DISPLAY_BASE + DP_B) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_B, PORT_B); - } + if (I915_READ(VLV_DISPLAY_BASE + DP_B) & DP_DETECTED || + intel_dp_is_edp(dev, PORT_B)) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_B, PORT_B); - if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIC) & SDVO_DETECTED) { + if (I915_READ(VLV_DISPLAY_BASE + GEN4_HDMIC) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + GEN4_HDMIC, PORT_C); - if (I915_READ(VLV_DISPLAY_BASE + DP_C) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_C, PORT_C); - } + if (I915_READ(VLV_DISPLAY_BASE + DP_C) & DP_DETECTED || + intel_dp_is_edp(dev, PORT_C)) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_C, PORT_C); if (IS_CHERRYVIEW(dev)) { - if (I915_READ(VLV_DISPLAY_BASE + CHV_HDMID) & SDVO_DETECTED) { + if (I915_READ(VLV_DISPLAY_BASE + CHV_HDMID) & SDVO_DETECTED) intel_hdmi_init(dev, VLV_DISPLAY_BASE + CHV_HDMID, PORT_D); - if (I915_READ(VLV_DISPLAY_BASE + DP_D) & DP_DETECTED) - intel_dp_init(dev, VLV_DISPLAY_BASE + DP_D, PORT_D); - } + /* eDP not supported on port D, so don't check VBT */ + if (I915_READ(VLV_DISPLAY_BASE + DP_D) & DP_DETECTED) + intel_dp_init(dev, VLV_DISPLAY_BASE + DP_D, PORT_D); } intel_dsi_init(dev); -- cgit v1.2.3 From 07c338ce98263a5af631b991dd8f96cff6ca2548 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 2 Oct 2014 11:16:32 +0300 Subject: drm/i915: fix short vs. long hpd detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix short vs. long hpd detection for non-g4x and non-pch split platforms. Broken since introduction in commit 13cf550448b58abf8f44f5d6a560f2d20871c965 Author: Dave Airlie Date: Wed Jun 18 11:29:35 2014 +1000 drm/i915: rework digital port IRQ handling (v2) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=83175 Reviewed-by: Ville Syrjälä Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_irq.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 54bf54706d1d..f66392b6e287 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -1711,7 +1711,7 @@ static irqreturn_t gen8_gt_irq_handler(struct drm_device *dev, #define HPD_STORM_DETECT_PERIOD 1000 #define HPD_STORM_THRESHOLD 5 -static int ilk_port_to_hotplug_shift(enum port port) +static int pch_port_to_hotplug_shift(enum port port) { switch (port) { case PORT_A: @@ -1727,7 +1727,7 @@ static int ilk_port_to_hotplug_shift(enum port port) } } -static int g4x_port_to_hotplug_shift(enum port port) +static int i915_port_to_hotplug_shift(enum port port) { switch (port) { case PORT_A: @@ -1785,12 +1785,12 @@ static inline void intel_hpd_irq_handler(struct drm_device *dev, if (port && dev_priv->hpd_irq_port[port]) { bool long_hpd; - if (IS_G4X(dev)) { - dig_shift = g4x_port_to_hotplug_shift(port); - long_hpd = (hotplug_trigger >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; - } else { - dig_shift = ilk_port_to_hotplug_shift(port); + if (HAS_PCH_SPLIT(dev)) { + dig_shift = pch_port_to_hotplug_shift(port); long_hpd = (dig_hotplug_reg >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; + } else { + dig_shift = i915_port_to_hotplug_shift(port); + long_hpd = (hotplug_trigger >> dig_shift) & PORTB_HOTPLUG_LONG_DETECT; } DRM_DEBUG_DRIVER("digital hpd port %c - %s\n", -- cgit v1.2.3 From e89dafb5ca5022d3bc63602018adfc766c73bc2b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 16 Oct 2014 17:43:02 +1100 Subject: powerpc: Only do dynamic DMA zone limits on platforms that need it Scott's patch 1c98025c6c95 "Dynamic DMA zone limits" changed dma_direct_alloc_coherent() to start using dev->coherent_dma_mask. That seems fair enough, but it exposes the fact that some of the drivers we care about on IBM platforms aren't setting the coherent mask. The proper fix is to have drivers set the coherent mask and also have the platform code honor it. For now, just restrict the dynamic DMA zone limits to the platforms that need it. Signed-off-by: Michael Ellerman Acked-by: Scott Wood --- arch/powerpc/kernel/dma.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/dma.c b/arch/powerpc/kernel/dma.c index adac9dc54aee..484b2d4462c1 100644 --- a/arch/powerpc/kernel/dma.c +++ b/arch/powerpc/kernel/dma.c @@ -53,9 +53,16 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size, #else struct page *page; int node = dev_to_node(dev); +#ifdef CONFIG_FSL_SOC u64 pfn = get_pfn_limit(dev); int zone; + /* + * This code should be OK on other platforms, but we have drivers that + * don't set coherent_dma_mask. As a workaround we just ifdef it. This + * whole routine needs some serious cleanup. + */ + zone = dma_pfn_limit_to_zone(pfn); if (zone < 0) { dev_err(dev, "%s: No suitable zone for pfn %#llx\n", @@ -73,6 +80,7 @@ void *dma_direct_alloc_coherent(struct device *dev, size_t size, break; #endif }; +#endif /* CONFIG_FSL_SOC */ /* ignore region specifiers */ flag &= ~(__GFP_HIGHMEM); -- cgit v1.2.3 From 83d04c39f9048807a8500e575ae3f1718a3f45bb Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 13 Oct 2014 13:23:48 -0400 Subject: drm/radeon: initialize sadb to NULL in the audio code Fixes kfree of the sadb buffer when it's NULL. Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/dce3_1_afmt.c | 2 +- drivers/gpu/drm/radeon/dce6_afmt.c | 2 +- drivers/gpu/drm/radeon/evergreen_hdmi.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/radeon/dce3_1_afmt.c b/drivers/gpu/drm/radeon/dce3_1_afmt.c index 950af153f30e..6b1dbecdd36d 100644 --- a/drivers/gpu/drm/radeon/dce3_1_afmt.c +++ b/drivers/gpu/drm/radeon/dce3_1_afmt.c @@ -32,7 +32,7 @@ static void dce3_2_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { diff --git a/drivers/gpu/drm/radeon/dce6_afmt.c b/drivers/gpu/drm/radeon/dce6_afmt.c index c0bbf68dbc27..960a5f0f042f 100644 --- a/drivers/gpu/drm/radeon/dce6_afmt.c +++ b/drivers/gpu/drm/radeon/dce6_afmt.c @@ -155,7 +155,7 @@ void dce6_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 offset, tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; if (!dig || !dig->afmt || !dig->afmt->pin) diff --git a/drivers/gpu/drm/radeon/evergreen_hdmi.c b/drivers/gpu/drm/radeon/evergreen_hdmi.c index 2514d659b1ba..f6a5c3026f85 100644 --- a/drivers/gpu/drm/radeon/evergreen_hdmi.c +++ b/drivers/gpu/drm/radeon/evergreen_hdmi.c @@ -133,7 +133,7 @@ static void dce4_afmt_write_speaker_allocation(struct drm_encoder *encoder) struct drm_connector *connector; struct radeon_connector *radeon_connector = NULL; u32 tmp; - u8 *sadb; + u8 *sadb = NULL; int sad_count; list_for_each_entry(connector, &encoder->dev->mode_config.connector_list, head) { -- cgit v1.2.3 From 4910403836ded89803fab201d4b5caaa85de3a89 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 13 Oct 2014 11:51:50 -0400 Subject: drm/radeon: fix speaker allocation setup If the sad_count is 0, set the hw to stereo and change the error message to a warn. A lot of monitors don't set the speaker allocation block. Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/dce3_1_afmt.c | 4 ++-- drivers/gpu/drm/radeon/dce6_afmt.c | 6 +++--- drivers/gpu/drm/radeon/evergreen_hdmi.c | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/radeon/dce3_1_afmt.c b/drivers/gpu/drm/radeon/dce3_1_afmt.c index 6b1dbecdd36d..2fe8cfc966d9 100644 --- a/drivers/gpu/drm/radeon/dce3_1_afmt.c +++ b/drivers/gpu/drm/radeon/dce3_1_afmt.c @@ -49,8 +49,8 @@ static void dce3_2_afmt_write_speaker_allocation(struct drm_encoder *encoder) sad_count = drm_edid_to_speaker_allocation(radeon_connector->edid, &sadb); if (sad_count < 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ diff --git a/drivers/gpu/drm/radeon/dce6_afmt.c b/drivers/gpu/drm/radeon/dce6_afmt.c index 960a5f0f042f..f312edf4d50e 100644 --- a/drivers/gpu/drm/radeon/dce6_afmt.c +++ b/drivers/gpu/drm/radeon/dce6_afmt.c @@ -176,9 +176,9 @@ void dce6_afmt_write_speaker_allocation(struct drm_encoder *encoder) } sad_count = drm_edid_to_speaker_allocation(radeon_connector_edid(connector), &sadb); - if (sad_count <= 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + if (sad_count < 0) { + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ diff --git a/drivers/gpu/drm/radeon/evergreen_hdmi.c b/drivers/gpu/drm/radeon/evergreen_hdmi.c index f6a5c3026f85..53abd9b17a50 100644 --- a/drivers/gpu/drm/radeon/evergreen_hdmi.c +++ b/drivers/gpu/drm/radeon/evergreen_hdmi.c @@ -149,9 +149,9 @@ static void dce4_afmt_write_speaker_allocation(struct drm_encoder *encoder) } sad_count = drm_edid_to_speaker_allocation(radeon_connector_edid(connector), &sadb); - if (sad_count <= 0) { - DRM_ERROR("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); - return; + if (sad_count < 0) { + DRM_DEBUG("Couldn't read Speaker Allocation Data Block: %d\n", sad_count); + sad_count = 0; } /* program the speaker allocation */ -- cgit v1.2.3 From adfed2b0587289013f8143c54913ddfd44ac1fd3 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 13 Oct 2014 13:20:02 -0400 Subject: drm/radeon: use gart memory for DMA ring tests Avoids HDP cache flush issues when using vram which can cause ring test failures on certain boards. Signed-off-by: Alex Deucher Cc: Alexander Fyodorov Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/cik_sdma.c | 21 ++++++++++++--------- drivers/gpu/drm/radeon/r600_dma.c | 21 ++++++++++++--------- drivers/gpu/drm/radeon/radeon.h | 2 ++ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/radeon/cik_sdma.c b/drivers/gpu/drm/radeon/cik_sdma.c index c473c9125295..7deb2ef4da32 100644 --- a/drivers/gpu/drm/radeon/cik_sdma.c +++ b/drivers/gpu/drm/radeon/cik_sdma.c @@ -618,16 +618,19 @@ int cik_sdma_ring_test(struct radeon_device *rdev, { unsigned i; int r; - void __iomem *ptr = (void *)rdev->vram_scratch.ptr; + unsigned index; u32 tmp; + u64 gpu_addr; - if (!ptr) { - DRM_ERROR("invalid vram scratch pointer\n"); - return -EINVAL; - } + if (ring->idx == R600_RING_TYPE_DMA_INDEX) + index = R600_WB_DMA_RING_TEST_OFFSET; + else + index = CAYMAN_WB_DMA1_RING_TEST_OFFSET; + + gpu_addr = rdev->wb.gpu_addr + index; tmp = 0xCAFEDEAD; - writel(tmp, ptr); + rdev->wb.wb[index/4] = cpu_to_le32(tmp); r = radeon_ring_lock(rdev, ring, 5); if (r) { @@ -635,14 +638,14 @@ int cik_sdma_ring_test(struct radeon_device *rdev, return r; } radeon_ring_write(ring, SDMA_PACKET(SDMA_OPCODE_WRITE, SDMA_WRITE_SUB_OPCODE_LINEAR, 0)); - radeon_ring_write(ring, rdev->vram_scratch.gpu_addr & 0xfffffffc); - radeon_ring_write(ring, upper_32_bits(rdev->vram_scratch.gpu_addr)); + radeon_ring_write(ring, lower_32_bits(gpu_addr)); + radeon_ring_write(ring, upper_32_bits(gpu_addr)); radeon_ring_write(ring, 1); /* number of DWs to follow */ radeon_ring_write(ring, 0xDEADBEEF); radeon_ring_unlock_commit(rdev, ring, false); for (i = 0; i < rdev->usec_timeout; i++) { - tmp = readl(ptr); + tmp = le32_to_cpu(rdev->wb.wb[index/4]); if (tmp == 0xDEADBEEF) break; DRM_UDELAY(1); diff --git a/drivers/gpu/drm/radeon/r600_dma.c b/drivers/gpu/drm/radeon/r600_dma.c index a49db830a47f..d9375a361985 100644 --- a/drivers/gpu/drm/radeon/r600_dma.c +++ b/drivers/gpu/drm/radeon/r600_dma.c @@ -241,16 +241,19 @@ int r600_dma_ring_test(struct radeon_device *rdev, { unsigned i; int r; - void __iomem *ptr = (void *)rdev->vram_scratch.ptr; + unsigned index; u32 tmp; + u64 gpu_addr; - if (!ptr) { - DRM_ERROR("invalid vram scratch pointer\n"); - return -EINVAL; - } + if (ring->idx == R600_RING_TYPE_DMA_INDEX) + index = R600_WB_DMA_RING_TEST_OFFSET; + else + index = CAYMAN_WB_DMA1_RING_TEST_OFFSET; + + gpu_addr = rdev->wb.gpu_addr + index; tmp = 0xCAFEDEAD; - writel(tmp, ptr); + rdev->wb.wb[index/4] = cpu_to_le32(tmp); r = radeon_ring_lock(rdev, ring, 4); if (r) { @@ -258,13 +261,13 @@ int r600_dma_ring_test(struct radeon_device *rdev, return r; } radeon_ring_write(ring, DMA_PACKET(DMA_PACKET_WRITE, 0, 0, 1)); - radeon_ring_write(ring, rdev->vram_scratch.gpu_addr & 0xfffffffc); - radeon_ring_write(ring, upper_32_bits(rdev->vram_scratch.gpu_addr) & 0xff); + radeon_ring_write(ring, lower_32_bits(gpu_addr)); + radeon_ring_write(ring, upper_32_bits(gpu_addr) & 0xff); radeon_ring_write(ring, 0xDEADBEEF); radeon_ring_unlock_commit(rdev, ring, false); for (i = 0; i < rdev->usec_timeout; i++) { - tmp = readl(ptr); + tmp = le32_to_cpu(rdev->wb.wb[index/4]); if (tmp == 0xDEADBEEF) break; DRM_UDELAY(1); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index e01424fe2848..588672d4fd59 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1132,6 +1132,8 @@ struct radeon_wb { #define R600_WB_EVENT_OFFSET 3072 #define CIK_WB_CP1_WPTR_OFFSET 3328 #define CIK_WB_CP2_WPTR_OFFSET 3584 +#define R600_WB_DMA_RING_TEST_OFFSET 3588 +#define CAYMAN_WB_DMA1_RING_TEST_OFFSET 3592 /** * struct radeon_pm - power management datas -- cgit v1.2.3 From 9ace2ef7b78e573cedead0f08052b028181e6eaf Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Thu, 9 Oct 2014 15:03:03 +0900 Subject: drm/ttm: Don't skip fpfn check if lpfn is 0 in ttm_bo_mem_compat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Christian König Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_bo.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 8f5cec67c47d..ba74a11bceb7 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -994,9 +994,9 @@ static bool ttm_bo_mem_compat(struct ttm_placement *placement, for (i = 0; i < placement->num_placement; i++) { const struct ttm_place *heap = &placement->placement[i]; - if (mem->mm_node && heap->lpfn != 0 && + if (mem->mm_node && (mem->start < heap->fpfn || - mem->start + mem->num_pages > heap->lpfn)) + (heap->lpfn != 0 && (mem->start + mem->num_pages) > heap->lpfn))) continue; *new_flags = heap->flags; @@ -1007,9 +1007,9 @@ static bool ttm_bo_mem_compat(struct ttm_placement *placement, for (i = 0; i < placement->num_busy_placement; i++) { const struct ttm_place *heap = &placement->busy_placement[i]; - if (mem->mm_node && heap->lpfn != 0 && + if (mem->mm_node && (mem->start < heap->fpfn || - mem->start + mem->num_pages > heap->lpfn)) + (heap->lpfn != 0 && (mem->start + mem->num_pages) > heap->lpfn))) continue; *new_flags = heap->flags; -- cgit v1.2.3 From e300180f71037fd9ed1ca967006fd9f3ee466bcd Mon Sep 17 00:00:00 2001 From: Michel Dänzer Date: Thu, 9 Oct 2014 15:02:59 +0900 Subject: drm/ttm: Don't evict BOs outside of the requested placement range MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The radeon driver uses placement range restrictions for several reasons, in particular to make sure BOs in VRAM can be accessed by the CPU, e.g. during a page fault. Without this change, TTM could evict other BOs while trying to satisfy the requested placement, even if the evicted BOs were outside of the requested placement range. Doing so didn't free up any space in the requested placement range, so the (potentially high) eviction cost was incurred for no benefit. Nominating for stable because radeon driver changes in 3.17 made this much more noticeable than before. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=84662 Cc: stable@vger.kernel.org Signed-off-by: Michel Dänzer Signed-off-by: Alex Deucher --- drivers/gpu/drm/ttm/ttm_bo.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index ba74a11bceb7..d395b0bef73b 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -709,6 +709,7 @@ out: static int ttm_mem_evict_first(struct ttm_bo_device *bdev, uint32_t mem_type, + const struct ttm_place *place, bool interruptible, bool no_wait_gpu) { @@ -720,8 +721,21 @@ static int ttm_mem_evict_first(struct ttm_bo_device *bdev, spin_lock(&glob->lru_lock); list_for_each_entry(bo, &man->lru, lru) { ret = __ttm_bo_reserve(bo, false, true, false, NULL); - if (!ret) + if (!ret) { + if (place && (place->fpfn || place->lpfn)) { + /* Don't evict this BO if it's outside of the + * requested placement range + */ + if (place->fpfn >= (bo->mem.start + bo->mem.size) || + (place->lpfn && place->lpfn <= bo->mem.start)) { + __ttm_bo_unreserve(bo); + ret = -EBUSY; + continue; + } + } + break; + } } if (ret) { @@ -782,7 +796,7 @@ static int ttm_bo_mem_force_space(struct ttm_buffer_object *bo, return ret; if (mem->mm_node) break; - ret = ttm_mem_evict_first(bdev, mem_type, + ret = ttm_mem_evict_first(bdev, mem_type, place, interruptible, no_wait_gpu); if (unlikely(ret != 0)) return ret; @@ -1233,7 +1247,7 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev, spin_lock(&glob->lru_lock); while (!list_empty(&man->lru)) { spin_unlock(&glob->lru_lock); - ret = ttm_mem_evict_first(bdev, mem_type, false, false); + ret = ttm_mem_evict_first(bdev, mem_type, NULL, false, false); if (ret) { if (allow_errors) { return ret; -- cgit v1.2.3 From 8e66e134e20b936179ea1535dd4ed19ec4f99dba Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 15 Oct 2014 17:20:55 -0400 Subject: drm/radeon: fix vm page table block size calculation The page offset is 12 bits. For example if we have an 8 GB VM, we'd need 33 bits. The number of bits needed for PD + PT is 21 (33 - 12 or log2(8) + 18), not 20 (log2(8) + 17). Noticed by Alexey during code review. Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/radeon/radeon_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 6fbab1582112..55065d844205 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -1126,7 +1126,7 @@ static void radeon_check_arguments(struct radeon_device *rdev) if (radeon_vm_block_size == -1) { /* Total bits covered by PD + PTs */ - unsigned bits = ilog2(radeon_vm_size) + 17; + unsigned bits = ilog2(radeon_vm_size) + 18; /* Make sure the PD is 4K in size up to 8GB address space. Above that split equal between PD and PTs */ -- cgit v1.2.3 From 01467a9b5e7ec7b9e30768bee16ea5861665015b Mon Sep 17 00:00:00 2001 From: Michele Curti Date: Tue, 14 Oct 2014 18:25:09 +0200 Subject: drm/radeon: reduce sparse false positive warnings include radeon_asic.h header file in the various xxx_dpm.c files to reduce sparse false positive warnings. Not so great patch in itself, but reducing warning count from 391 to 258 may help to see real problems.. Signed-off-by: Michele Curti Signed-off-by: Alex Deucher --- drivers/gpu/drm/radeon/btc_dpm.c | 1 + drivers/gpu/drm/radeon/ci_dpm.c | 1 + drivers/gpu/drm/radeon/cypress_dpm.c | 1 + drivers/gpu/drm/radeon/ni_dpm.c | 1 + drivers/gpu/drm/radeon/r600_dpm.c | 1 + drivers/gpu/drm/radeon/rs780_dpm.c | 1 + drivers/gpu/drm/radeon/rv6xx_dpm.c | 1 + drivers/gpu/drm/radeon/rv770_dpm.c | 1 + drivers/gpu/drm/radeon/si_dpm.c | 1 + drivers/gpu/drm/radeon/sumo_dpm.c | 1 + drivers/gpu/drm/radeon/trinity_dpm.c | 1 + 11 files changed, 11 insertions(+) diff --git a/drivers/gpu/drm/radeon/btc_dpm.c b/drivers/gpu/drm/radeon/btc_dpm.c index b7181607e421..0b2929de9f41 100644 --- a/drivers/gpu/drm/radeon/btc_dpm.c +++ b/drivers/gpu/drm/radeon/btc_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "btcd.h" #include "r600_dpm.h" #include "cypress_dpm.h" diff --git a/drivers/gpu/drm/radeon/ci_dpm.c b/drivers/gpu/drm/radeon/ci_dpm.c index f5c8c0445a94..11a55e9dad7f 100644 --- a/drivers/gpu/drm/radeon/ci_dpm.c +++ b/drivers/gpu/drm/radeon/ci_dpm.c @@ -24,6 +24,7 @@ #include #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "radeon_ucode.h" #include "cikd.h" #include "r600_dpm.h" diff --git a/drivers/gpu/drm/radeon/cypress_dpm.c b/drivers/gpu/drm/radeon/cypress_dpm.c index 47d31e915758..9aad0327e4d1 100644 --- a/drivers/gpu/drm/radeon/cypress_dpm.c +++ b/drivers/gpu/drm/radeon/cypress_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "evergreend.h" #include "r600_dpm.h" #include "cypress_dpm.h" diff --git a/drivers/gpu/drm/radeon/ni_dpm.c b/drivers/gpu/drm/radeon/ni_dpm.c index 715b181c6243..6d2f16cf2c1c 100644 --- a/drivers/gpu/drm/radeon/ni_dpm.c +++ b/drivers/gpu/drm/radeon/ni_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "nid.h" #include "r600_dpm.h" #include "ni_dpm.h" diff --git a/drivers/gpu/drm/radeon/r600_dpm.c b/drivers/gpu/drm/radeon/r600_dpm.c index 9c61b74ef441..f6309bd23e01 100644 --- a/drivers/gpu/drm/radeon/r600_dpm.c +++ b/drivers/gpu/drm/radeon/r600_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "r600d.h" #include "r600_dpm.h" #include "atom.h" diff --git a/drivers/gpu/drm/radeon/rs780_dpm.c b/drivers/gpu/drm/radeon/rs780_dpm.c index 02f7710de470..9031f4b69824 100644 --- a/drivers/gpu/drm/radeon/rs780_dpm.c +++ b/drivers/gpu/drm/radeon/rs780_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rs780d.h" #include "r600_dpm.h" #include "rs780_dpm.h" diff --git a/drivers/gpu/drm/radeon/rv6xx_dpm.c b/drivers/gpu/drm/radeon/rv6xx_dpm.c index e7045b085715..6a5c233361e9 100644 --- a/drivers/gpu/drm/radeon/rv6xx_dpm.c +++ b/drivers/gpu/drm/radeon/rv6xx_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rv6xxd.h" #include "r600_dpm.h" #include "rv6xx_dpm.h" diff --git a/drivers/gpu/drm/radeon/rv770_dpm.c b/drivers/gpu/drm/radeon/rv770_dpm.c index 3c76e1dcdf04..755a8f96fe46 100644 --- a/drivers/gpu/drm/radeon/rv770_dpm.c +++ b/drivers/gpu/drm/radeon/rv770_dpm.c @@ -24,6 +24,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "rv770d.h" #include "r600_dpm.h" #include "rv770_dpm.h" diff --git a/drivers/gpu/drm/radeon/si_dpm.c b/drivers/gpu/drm/radeon/si_dpm.c index 70e61ffeace2..a53c2e79d9cb 100644 --- a/drivers/gpu/drm/radeon/si_dpm.c +++ b/drivers/gpu/drm/radeon/si_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "sid.h" #include "r600_dpm.h" #include "si_dpm.h" diff --git a/drivers/gpu/drm/radeon/sumo_dpm.c b/drivers/gpu/drm/radeon/sumo_dpm.c index 3f0e8d7b8dbe..1f8a8833e1be 100644 --- a/drivers/gpu/drm/radeon/sumo_dpm.c +++ b/drivers/gpu/drm/radeon/sumo_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "sumod.h" #include "r600_dpm.h" #include "cypress_dpm.h" diff --git a/drivers/gpu/drm/radeon/trinity_dpm.c b/drivers/gpu/drm/radeon/trinity_dpm.c index 57f780053b3e..b4ec5c4e7969 100644 --- a/drivers/gpu/drm/radeon/trinity_dpm.c +++ b/drivers/gpu/drm/radeon/trinity_dpm.c @@ -23,6 +23,7 @@ #include "drmP.h" #include "radeon.h" +#include "radeon_asic.h" #include "trinityd.h" #include "r600_dpm.h" #include "trinity_dpm.h" -- cgit v1.2.3 From d8054749c6795073cb427465a726213d45898f68 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 19 Jun 2014 15:43:29 +0800 Subject: Thermal: int340x thermal: select ACPI fan driver we share the same driver for both ACPI predefined Fan device and INT3404 Fan device, thus we should select the ACPI Fan driver when int340x thermal drivers are enabeld. Signed-off-by: Zhang Rui --- drivers/acpi/Kconfig | 2 +- drivers/thermal/Kconfig | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index d0f3265fb85d..b23fe37f67c0 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -144,7 +144,7 @@ config ACPI_VIDEO config ACPI_FAN tristate "Fan" - select THERMAL + depends on THERMAL default y help This driver supports ACPI fan devices, allowing user-mode diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index 3a8929222cab..e65ca2f8917e 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -224,6 +224,7 @@ config INT340X_THERMAL depends on X86 && ACPI select THERMAL_GOV_USER_SPACE select ACPI_THERMAL_REL + select ACPI_FAN help Newer laptops and tablets that use ACPI may have thermal sensors and other devices with thermal control capabilities outside the core -- cgit v1.2.3 From fcb1c2d75b55fe52c40cd9692fda73da061f19aa Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Oct 2014 12:58:25 +0200 Subject: s390: wire up bpf syscall Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/unistd.h | 3 ++- arch/s390/kernel/compat_wrapper.c | 1 + arch/s390/kernel/syscalls.S | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 940ac49198db..4197c89c52d4 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -286,7 +286,8 @@ #define __NR_seccomp 348 #define __NR_getrandom 349 #define __NR_memfd_create 350 -#define NR_syscalls 351 +#define __NR_bpf 351 +#define NR_syscalls 352 /* * There are some system calls that are not present on 64 bit, some diff --git a/arch/s390/kernel/compat_wrapper.c b/arch/s390/kernel/compat_wrapper.c index faf6caa510dc..c4f7a3d655b8 100644 --- a/arch/s390/kernel/compat_wrapper.c +++ b/arch/s390/kernel/compat_wrapper.c @@ -217,3 +217,4 @@ COMPAT_SYSCALL_WRAP5(renameat2, int, olddfd, const char __user *, oldname, int, COMPAT_SYSCALL_WRAP3(seccomp, unsigned int, op, unsigned int, flags, const char __user *, uargs) COMPAT_SYSCALL_WRAP3(getrandom, char __user *, buf, size_t, count, unsigned int, flags) COMPAT_SYSCALL_WRAP2(memfd_create, const char __user *, uname, unsigned int, flags) +COMPAT_SYSCALL_WRAP3(bpf, int, cmd, union bpf_attr *, attr, unsigned int, size); diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 6fe886ac2db5..9f7087fd58de 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -359,3 +359,4 @@ SYSCALL(sys_renameat2,sys_renameat2,compat_sys_renameat2) SYSCALL(sys_seccomp,sys_seccomp,compat_sys_seccomp) SYSCALL(sys_getrandom,sys_getrandom,compat_sys_getrandom) SYSCALL(sys_memfd_create,sys_memfd_create,compat_sys_memfd_create) /* 350 */ +SYSCALL(sys_bpf,sys_bpf,compat_sys_bpf) -- cgit v1.2.3 From d6fe5be34cf03e7db36d99c1b9d8e472ad3bdb87 Mon Sep 17 00:00:00 2001 From: Jan Willeke Date: Wed, 8 Oct 2014 10:16:08 +0200 Subject: s390/uprobes: fix kprobes dependency If kprobes is disabled uprobes will not compile. Fix this by including the correct header files. Signed-off-by: Jan Willeke Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/uprobes.c | 2 +- arch/s390/lib/probes.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 956f4f7a591c..f6b3cd056ec2 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -5,13 +5,13 @@ * Author(s): Jan Willeke, */ -#include #include #include #include #include #include #include +#include #include #include "entry.h" diff --git a/arch/s390/lib/probes.c b/arch/s390/lib/probes.c index c5d64a099719..ae90e1ae3607 100644 --- a/arch/s390/lib/probes.c +++ b/arch/s390/lib/probes.c @@ -4,7 +4,7 @@ * Copyright IBM Corp. 2014 */ -#include +#include #include int probe_is_prohibited_opcode(u16 *insn) -- cgit v1.2.3 From d4c5efdb97773f59a2b711754ca0953f24516739 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 26 Aug 2014 23:16:35 -0400 Subject: random: add and use memzero_explicit() for clearing data zatimend has reported that in his environment (3.16/gcc4.8.3/corei7) memset() calls which clear out sensitive data in extract_{buf,entropy, entropy_user}() in random driver are being optimized away by gcc. Add a helper memzero_explicit() (similarly as explicit_bzero() variants) that can be used in such cases where a variable with sensitive data is being cleared out in the end. Other use cases might also be in crypto code. [ I have put this into lib/string.c though, as it's always built-in and doesn't need any dependencies then. ] Fixes kernel bugzilla: 82041 Reported-by: zatimend@hotmail.co.uk Signed-off-by: Daniel Borkmann Acked-by: Hannes Frederic Sowa Cc: Alexey Dobriyan Signed-off-by: Theodore Ts'o Cc: stable@vger.kernel.org --- drivers/char/random.c | 8 ++++---- include/linux/string.h | 5 +++-- lib/string.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index c18d41db83d8..8c86a95203a0 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1106,7 +1106,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) __mix_pool_bytes(r, hash.w, sizeof(hash.w)); spin_unlock_irqrestore(&r->lock, flags); - memset(workspace, 0, sizeof(workspace)); + memzero_explicit(workspace, sizeof(workspace)); /* * In case the hash function has some recognizable output @@ -1118,7 +1118,7 @@ static void extract_buf(struct entropy_store *r, __u8 *out) hash.w[2] ^= rol32(hash.w[2], 16); memcpy(out, &hash, EXTRACT_SIZE); - memset(&hash, 0, sizeof(hash)); + memzero_explicit(&hash, sizeof(hash)); } /* @@ -1175,7 +1175,7 @@ static ssize_t extract_entropy(struct entropy_store *r, void *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } @@ -1218,7 +1218,7 @@ static ssize_t extract_entropy_user(struct entropy_store *r, void __user *buf, } /* Wipe data just returned from memory */ - memset(tmp, 0, sizeof(tmp)); + memzero_explicit(tmp, sizeof(tmp)); return ret; } diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..3b42b3732da6 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -132,7 +132,7 @@ int bprintf(u32 *bin_buf, size_t size, const char *fmt, ...) __printf(3, 4); #endif extern ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, - const void *from, size_t available); + const void *from, size_t available); /** * strstarts - does @str start with @prefix? @@ -144,7 +144,8 @@ static inline bool strstarts(const char *str, const char *prefix) return strncmp(str, prefix, strlen(prefix)) == 0; } -extern size_t memweight(const void *ptr, size_t bytes); +size_t memweight(const void *ptr, size_t bytes); +void memzero_explicit(void *s, size_t count); /** * kbasename - return the last part of a pathname. diff --git a/lib/string.c b/lib/string.c index 992bf30af759..3a3120452a1d 100644 --- a/lib/string.c +++ b/lib/string.c @@ -604,6 +604,22 @@ void *memset(void *s, int c, size_t count) EXPORT_SYMBOL(memset); #endif +/** + * memzero_explicit - Fill a region of memory (e.g. sensitive + * keying data) with 0s. + * @s: Pointer to the start of the area. + * @count: The size of the area. + * + * memzero_explicit() doesn't need an arch-specific version as + * it just invokes the one of memset() implicitly. + */ +void memzero_explicit(void *s, size_t count) +{ + memset(s, 0, count); + OPTIMIZER_HIDE_VAR(s); +} +EXPORT_SYMBOL(memzero_explicit); + #ifndef __HAVE_ARCH_MEMCPY /** * memcpy - Copy one area of memory to another -- cgit v1.2.3 From 7185ad2672a7d50bc384de0e38d90b75d99f3d82 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 7 Sep 2014 23:23:38 +0200 Subject: crypto: memzero_explicit - make sure to clear out sensitive data Recently, in commit 13aa93c70e71 ("random: add and use memzero_explicit() for clearing data"), we have found that GCC may optimize some memset() cases away when it detects a stack variable is not being used anymore and going out of scope. This can happen, for example, in cases when we are clearing out sensitive information such as keying material or any e.g. intermediate results from crypto computations, etc. With the help of Coccinelle, we can figure out and fix such occurences in the crypto subsytem as well. Julia Lawall provided the following Coccinelle program: @@ type T; identifier x; @@ T x; ... when exists when any -memset +memzero_explicit (&x, -0, ...) ... when != x when strict @@ type T; identifier x; @@ T x[...]; ... when exists when any -memset +memzero_explicit (x, -0, ...) ... when != x when strict Therefore, make use of the drop-in replacement memzero_explicit() for exactly such cases instead of using memset(). Signed-off-by: Daniel Borkmann Cc: Julia Lawall Cc: Herbert Xu Cc: Theodore Ts'o Cc: Hannes Frederic Sowa Acked-by: Hannes Frederic Sowa Acked-by: Herbert Xu Signed-off-by: Theodore Ts'o --- crypto/cts.c | 3 ++- crypto/sha1_generic.c | 2 +- crypto/sha256_generic.c | 5 ++--- crypto/sha512_generic.c | 2 +- crypto/tgr192.c | 4 ++-- crypto/vmac.c | 2 +- crypto/wp512.c | 8 ++++---- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crypto/cts.c b/crypto/cts.c index 042223f8e733..133f0874c95e 100644 --- a/crypto/cts.c +++ b/crypto/cts.c @@ -202,7 +202,8 @@ static int cts_cbc_decrypt(struct crypto_cts_ctx *ctx, /* 5. Append the tail (BB - Ln) bytes of Xn (tmp) to Cn to create En */ memcpy(s + bsize + lastn, tmp + lastn, bsize - lastn); /* 6. Decrypt En to create Pn-1 */ - memset(iv, 0, sizeof(iv)); + memzero_explicit(iv, sizeof(iv)); + sg_set_buf(&sgsrc[0], s + bsize, bsize); sg_set_buf(&sgdst[0], d, bsize); err = crypto_blkcipher_decrypt_iv(&lcldesc, sgdst, sgsrc, bsize); diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index 42794803c480..7bb047432782 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -64,7 +64,7 @@ int crypto_sha1_update(struct shash_desc *desc, const u8 *data, src = data + done; } while (done + SHA1_BLOCK_SIZE <= len); - memset(temp, 0, sizeof(temp)); + memzero_explicit(temp, sizeof(temp)); partial = 0; } memcpy(sctx->buffer + partial, src, len - done); diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c index 543366779524..32c5e5ea205a 100644 --- a/crypto/sha256_generic.c +++ b/crypto/sha256_generic.c @@ -210,10 +210,9 @@ static void sha256_transform(u32 *state, const u8 *input) /* clear any sensitive info... */ a = b = c = d = e = f = g = h = t1 = t2 = 0; - memset(W, 0, 64 * sizeof(u32)); + memzero_explicit(W, 64 * sizeof(u32)); } - static int sha224_init(struct shash_desc *desc) { struct sha256_state *sctx = shash_desc_ctx(desc); @@ -316,7 +315,7 @@ static int sha224_final(struct shash_desc *desc, u8 *hash) sha256_final(desc, D); memcpy(hash, D, SHA224_DIGEST_SIZE); - memset(D, 0, SHA256_DIGEST_SIZE); + memzero_explicit(D, SHA256_DIGEST_SIZE); return 0; } diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c index 6ed124f3ea0f..04d295a8bc08 100644 --- a/crypto/sha512_generic.c +++ b/crypto/sha512_generic.c @@ -238,7 +238,7 @@ static int sha384_final(struct shash_desc *desc, u8 *hash) sha512_final(desc, D); memcpy(hash, D, 48); - memset(D, 0, 64); + memzero_explicit(D, 64); return 0; } diff --git a/crypto/tgr192.c b/crypto/tgr192.c index 87403556fd0b..3c7af0d1ff7a 100644 --- a/crypto/tgr192.c +++ b/crypto/tgr192.c @@ -612,7 +612,7 @@ static int tgr160_final(struct shash_desc *desc, u8 * out) tgr192_final(desc, D); memcpy(out, D, TGR160_DIGEST_SIZE); - memset(D, 0, TGR192_DIGEST_SIZE); + memzero_explicit(D, TGR192_DIGEST_SIZE); return 0; } @@ -623,7 +623,7 @@ static int tgr128_final(struct shash_desc *desc, u8 * out) tgr192_final(desc, D); memcpy(out, D, TGR128_DIGEST_SIZE); - memset(D, 0, TGR192_DIGEST_SIZE); + memzero_explicit(D, TGR192_DIGEST_SIZE); return 0; } diff --git a/crypto/vmac.c b/crypto/vmac.c index 2eb11a30c29c..d84c24bd7ff7 100644 --- a/crypto/vmac.c +++ b/crypto/vmac.c @@ -613,7 +613,7 @@ static int vmac_final(struct shash_desc *pdesc, u8 *out) } mac = vmac(ctx->partial, ctx->partial_size, nonce, NULL, ctx); memcpy(out, &mac, sizeof(vmac_t)); - memset(&mac, 0, sizeof(vmac_t)); + memzero_explicit(&mac, sizeof(vmac_t)); memset(&ctx->__vmac_ctx, 0, sizeof(struct vmac_ctx)); ctx->partial_size = 0; return 0; diff --git a/crypto/wp512.c b/crypto/wp512.c index 180f1d6e03f4..ec64e7762fbb 100644 --- a/crypto/wp512.c +++ b/crypto/wp512.c @@ -1102,8 +1102,8 @@ static int wp384_final(struct shash_desc *desc, u8 *out) u8 D[64]; wp512_final(desc, D); - memcpy (out, D, WP384_DIGEST_SIZE); - memset (D, 0, WP512_DIGEST_SIZE); + memcpy(out, D, WP384_DIGEST_SIZE); + memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } @@ -1113,8 +1113,8 @@ static int wp256_final(struct shash_desc *desc, u8 *out) u8 D[64]; wp512_final(desc, D); - memcpy (out, D, WP256_DIGEST_SIZE); - memset (D, 0, WP512_DIGEST_SIZE); + memcpy(out, D, WP256_DIGEST_SIZE); + memzero_explicit(D, WP512_DIGEST_SIZE); return 0; } -- cgit v1.2.3 From a011e213f3700233ed2a676f1ef0a74a052d7162 Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Fri, 17 Oct 2014 12:43:34 -0700 Subject: ALSA: pcm: use the same dma mmap codepath both for arm and arm64 This avoids following kernel crash when try to playback on arm64 [ 107.497203] [] snd_pcm_mmap_data_fault+0x90/0xd4 [ 107.503405] [] __do_fault+0xb0/0x498 [ 107.508565] [] handle_mm_fault+0x224/0x7b0 [ 107.514246] [] do_page_fault+0x11c/0x310 [ 107.519738] [] do_mem_abort+0x38/0x98 Tested: backported to 3.14 and tried to playback on arm64 machine Signed-off-by: Anatol Pomozov Cc: Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index bfe1cf6b492f..815396d8427f 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -3311,7 +3311,7 @@ static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { #ifndef ARCH_HAS_DMA_MMAP_COHERENT /* This should be defined / handled globally! */ -#ifdef CONFIG_ARM +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) #define ARCH_HAS_DMA_MMAP_COHERENT #endif #endif -- cgit v1.2.3 From fadc07522c3ce65c4d1c69a9284605a07aea1be8 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 19 Oct 2014 19:19:57 +0300 Subject: MAINTAINERS: Change Boaz Harrosh's email I have moved on, and do no longer have Panasas email access. Update to an email that can reach me. So change bharrosh@panasas.com => ooo@electrozaur.com Explain of email address: * electrozaur.com is a domain owned by me. * ooo - Stands for Open Osd . Org Another email alias that can be used is: openosd@gmail.com CC: Greg KH Signed-off-by: Boaz Harrosh --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f10ed3914ea8..fc2c9a838d92 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6725,7 +6725,7 @@ S: Orphan F: drivers/net/wireless/orinoco/ OSD LIBRARY and FILESYSTEM -M: Boaz Harrosh +M: Boaz Harrosh M: Benny Halevy L: osd-dev@open-osd.org W: http://open-osd.org -- cgit v1.2.3 From aa281ac631008b9c18c405c8880007789f659c7d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 19 Oct 2014 19:38:58 +0300 Subject: Boaz Harrosh - Fix broken email address I no longer have access to the Panasas email. So change to an email that can always reach me. Signed-off-by: Boaz Harrosh --- drivers/scsi/osd/Kbuild | 2 +- drivers/scsi/osd/Kconfig | 2 +- drivers/scsi/osd/osd_debug.h | 2 +- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/osd/osd_uld.c | 4 ++-- fs/exofs/Kbuild | 2 +- fs/exofs/common.h | 2 +- fs/exofs/dir.c | 2 +- fs/exofs/exofs.h | 2 +- fs/exofs/file.c | 2 +- fs/exofs/inode.c | 2 +- fs/exofs/namei.c | 2 +- fs/exofs/ore.c | 4 ++-- fs/exofs/ore_raid.c | 2 +- fs/exofs/ore_raid.h | 2 +- fs/exofs/super.c | 2 +- fs/exofs/symlink.c | 2 +- fs/exofs/sys.c | 2 +- fs/nfs/objlayout/objio_osd.c | 2 +- fs/nfs/objlayout/objlayout.c | 2 +- fs/nfs/objlayout/objlayout.h | 2 +- fs/nfs/objlayout/pnfs_osd_xdr_cli.c | 2 +- include/linux/pnfs_osd_xdr.h | 2 +- include/scsi/osd_initiator.h | 2 +- include/scsi/osd_ore.h | 2 +- include/scsi/osd_protocol.h | 4 ++-- include/scsi/osd_sec.h | 2 +- include/scsi/osd_sense.h | 2 +- include/scsi/osd_types.h | 2 +- 29 files changed, 33 insertions(+), 33 deletions(-) diff --git a/drivers/scsi/osd/Kbuild b/drivers/scsi/osd/Kbuild index 5fd73d77c3af..58cecd45b0f5 100644 --- a/drivers/scsi/osd/Kbuild +++ b/drivers/scsi/osd/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/Kconfig b/drivers/scsi/osd/Kconfig index a0703514eb0f..347cc5e33749 100644 --- a/drivers/scsi/osd/Kconfig +++ b/drivers/scsi/osd/Kconfig @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # Benny Halevy # # This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_debug.h b/drivers/scsi/osd/osd_debug.h index 579e491f11df..26341261bb5c 100644 --- a/drivers/scsi/osd/osd_debug.h +++ b/drivers/scsi/osd/osd_debug.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 5f4cbf0c4759..52e243b77000 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -7,7 +7,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -57,7 +57,7 @@ enum { OSD_REQ_RETRIES = 1 }; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd initiator library libosd.ko"); MODULE_LICENSE("GPL"); diff --git a/drivers/scsi/osd/osd_uld.c b/drivers/scsi/osd/osd_uld.c index e1d9a4c4c4b3..92cdd4b06526 100644 --- a/drivers/scsi/osd/osd_uld.c +++ b/drivers/scsi/osd/osd_uld.c @@ -10,7 +10,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -74,7 +74,7 @@ static const char osd_name[] = "osd"; static const char *osd_version_string = "open-osd 0.2.1"; -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("open-osd Upper-Layer-Driver osd.ko"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CHARDEV_MAJOR(SCSI_OSD_MAJOR); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh +# Boaz Harrosh # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -4,7 +4,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * @@ -29,7 +29,7 @@ #include "ore_raid.h" -MODULE_AUTHOR("Boaz Harrosh "); +MODULE_AUTHOR("Boaz Harrosh "); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -1,6 +1,6 @@ /* * Copyright (C) from 2011 - * Boaz Harrosh + * Boaz Harrosh * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh + * Boaz Harrosh * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2012 * Sachin Bhamare - * Boaz Harrosh + * Boaz Harrosh * * This file is part of exofs. * diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ae05278b3761..5cbd8cf9a5a5 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 697a16d11fac..ad988428162e 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fd13f1d2f136..3472e8390ba5 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -6,7 +6,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/linux/pnfs_osd_xdr.h b/include/linux/pnfs_osd_xdr.h index fe25876c1a5d..17d7d0d20eca 100644 --- a/include/linux/pnfs_osd_xdr.h +++ b/include/linux/pnfs_osd_xdr.h @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy - * Boaz Harrosh + * Boaz Harrosh * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/include/scsi/osd_initiator.h b/include/scsi/osd_initiator.h index b2e85fdd2ae0..a09cca829082 100644 --- a/include/scsi/osd_initiator.h +++ b/include/scsi/osd_initiator.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_ore.h b/include/scsi/osd_ore.h index 6ca3265a4dca..7a8d2cd30328 100644 --- a/include/scsi/osd_ore.h +++ b/include/scsi/osd_ore.h @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh + * Boaz Harrosh * * Public Declarations of the ORE API * diff --git a/include/scsi/osd_protocol.h b/include/scsi/osd_protocol.h index a2594afe05c7..e0ca835e7bf7 100644 --- a/include/scsi/osd_protocol.h +++ b/include/scsi/osd_protocol.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify @@ -496,7 +496,7 @@ struct osd_timestamp { */ struct osd_key_identifier { - u8 id[7]; /* if you know why 7 please email bharrosh@panasas.com */ + u8 id[7]; /* if you know why 7 please email ooo@electrozaur.com */ } __packed; /* for osd_capability.format */ diff --git a/include/scsi/osd_sec.h b/include/scsi/osd_sec.h index f96151c9c9e8..7abeb0f0db30 100644 --- a/include/scsi/osd_sec.h +++ b/include/scsi/osd_sec.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_sense.h b/include/scsi/osd_sense.h index 91db543a5502..d52aa93a0b2d 100644 --- a/include/scsi/osd_sense.h +++ b/include/scsi/osd_sense.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify diff --git a/include/scsi/osd_types.h b/include/scsi/osd_types.h index bd0be7ed4bcf..48e8a165e136 100644 --- a/include/scsi/osd_types.h +++ b/include/scsi/osd_types.h @@ -4,7 +4,7 @@ * Copyright (C) 2008 Panasas Inc. All rights reserved. * * Authors: - * Boaz Harrosh + * Boaz Harrosh * Benny Halevy * * This program is free software; you can redistribute it and/or modify -- cgit v1.2.3 From 1fa3a002b2546c42c343c77c144871285896ced5 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 19 Oct 2014 20:36:36 +0300 Subject: Boaz Harrosh - fix email in Documentation I forgot to update Documentation/*.txt Signed-off-by: Boaz Harrosh --- Documentation/scsi/osd.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Documentation/scsi/osd.txt b/Documentation/scsi/osd.txt index da162f7fd5f5..5a9879bad073 100644 --- a/Documentation/scsi/osd.txt +++ b/Documentation/scsi/osd.txt @@ -184,8 +184,7 @@ Any problems, questions, bug reports, lonely OSD nights, please email: More up-to-date information can be found on: http://open-osd.org -Boaz Harrosh -Benny Halevy +Boaz Harrosh References ========== -- cgit v1.2.3 From 3d5960472de0a245c56ba55e9a59b8d0f4c69cb4 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 16 Oct 2014 15:07:51 +0200 Subject: hwmon: (menf21bmc) Include linux/err.h Include linux/err.h to get the definitions for IS_ERR() PTR_ERR() and ERR_PTR() used in the driver. This fixes compilation on powerpc targets. Signed-off-by: Johannes Thumshirn Signed-off-by: Guenter Roeck --- drivers/hwmon/menf21bmc_hwmon.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hwmon/menf21bmc_hwmon.c b/drivers/hwmon/menf21bmc_hwmon.c index c92229d321c9..afc6b58eaa62 100644 --- a/drivers/hwmon/menf21bmc_hwmon.c +++ b/drivers/hwmon/menf21bmc_hwmon.c @@ -21,6 +21,7 @@ #include #include #include +#include #define DRV_NAME "menf21bmc_hwmon" -- cgit v1.2.3 From b450b17c156e264bc44a198046d3ebaaef5a041d Mon Sep 17 00:00:00 2001 From: Harsha Priya Date: Thu, 9 Oct 2014 11:04:56 +0000 Subject: ALSA: ALC283 codec - Avoid pop noise on headphones during suspend/resume This patch sets the headphones mode to default before suspending which helps avoid the pop noise on headphones Signed-off-by: Harsha Priya Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index bc86c36b4bfa..4c35490cf119 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -2884,6 +2884,9 @@ static void alc283_shutup(struct hda_codec *codec) alc_write_coef_idx(codec, 0x43, 0x9004); + /*depop hp during suspend*/ + alc_write_coef_idx(codec, 0x06, 0x2100); + snd_hda_codec_write(codec, hp_pin, 0, AC_VERB_SET_AMP_GAIN_MUTE, AMP_OUT_MUTE); -- cgit v1.2.3 From f0b127fbfdc8756eba7437ab668f3169280bd358 Mon Sep 17 00:00:00 2001 From: Vlad Catoi Date: Sat, 18 Oct 2014 17:45:41 -0500 Subject: ALSA: usb-audio: Add support for Steinberg UR22 USB interface Adding support for Steinberg UR22 USB interface via quirks table patch See Ubuntu bug report: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1317244 Also see threads: http://linux-audio.4202.n7.nabble.com/Support-for-Steinberg-UR22-Yamaha-USB-chipset-0499-1509-tc82888.html#a82917 http://www.steinberg.net/forums/viewtopic.php?t=62290 Tested by at least 4 people judging by the threads. Did not test MIDI interface, but audio output and capture both are functional. Built 3.17 kernel with this driver on Ubuntu 14.04 & tested with mpg123 Patch applied to 3.13 Ubuntu kernel works well enough for daily use. Signed-off-by: Vlad Catoi Acked-by: Clemens Ladisch Cc: Signed-off-by: Takashi Iwai --- sound/usb/quirks-table.h | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/sound/usb/quirks-table.h b/sound/usb/quirks-table.h index 223c47b33ba3..c657752a420c 100644 --- a/sound/usb/quirks-table.h +++ b/sound/usb/quirks-table.h @@ -384,6 +384,36 @@ YAMAHA_DEVICE(0x105d, NULL), } } }, +{ + USB_DEVICE(0x0499, 0x1509), + .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { + /* .vendor_name = "Yamaha", */ + /* .product_name = "Steinberg UR22", */ + .ifnum = QUIRK_ANY_INTERFACE, + .type = QUIRK_COMPOSITE, + .data = (const struct snd_usb_audio_quirk[]) { + { + .ifnum = 1, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 2, + .type = QUIRK_AUDIO_STANDARD_INTERFACE + }, + { + .ifnum = 3, + .type = QUIRK_MIDI_YAMAHA + }, + { + .ifnum = 4, + .type = QUIRK_IGNORE_INTERFACE + }, + { + .ifnum = -1 + } + } + } +}, { USB_DEVICE(0x0499, 0x150a), .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) { -- cgit v1.2.3 From 6acce400d9daf1353fbf497302670c90a3205e1d Mon Sep 17 00:00:00 2001 From: Anssi Hannula Date: Sun, 19 Oct 2014 19:25:19 +0300 Subject: ALSA: hda - hdmi: Fix missing ELD change event on plug/unplug The ELD ALSA control change event is sent by hdmi_present_sense() when eld_changed is true. Currently, it is only true when the ELD buffer contents have been modified. However, the user-visible ELD controls also change to a zero-length value and back when eld_valid is unset/set, and no event is currently sent in such cases (such as when unplugging or replugging a sink). Fix the code to always set eld_changed if eld_valid value is changed, and therefore to always send the change event when the user-visible value changes. Signed-off-by: Anssi Hannula Cc: David Henningsson Cc: # 3.9+ Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_hdmi.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/sound/pci/hda/patch_hdmi.c b/sound/pci/hda/patch_hdmi.c index 39862e98551c..9dc9cf8c90e9 100644 --- a/sound/pci/hda/patch_hdmi.c +++ b/sound/pci/hda/patch_hdmi.c @@ -1583,19 +1583,22 @@ static bool hdmi_present_sense(struct hdmi_spec_per_pin *per_pin, int repoll) } } - if (pin_eld->eld_valid && !eld->eld_valid) { - update_eld = true; + if (pin_eld->eld_valid != eld->eld_valid) eld_changed = true; - } + + if (pin_eld->eld_valid && !eld->eld_valid) + update_eld = true; + if (update_eld) { bool old_eld_valid = pin_eld->eld_valid; pin_eld->eld_valid = eld->eld_valid; - eld_changed = pin_eld->eld_size != eld->eld_size || + if (pin_eld->eld_size != eld->eld_size || memcmp(pin_eld->eld_buffer, eld->eld_buffer, - eld->eld_size) != 0; - if (eld_changed) + eld->eld_size) != 0) { memcpy(pin_eld->eld_buffer, eld->eld_buffer, eld->eld_size); + eld_changed = true; + } pin_eld->eld_size = eld->eld_size; pin_eld->info = eld->info; -- cgit v1.2.3 From 40ac948e02e01e98474330f259ff842844096541 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 15 Oct 2014 13:42:29 +1000 Subject: drm/gt215/gr: fix initialisation on gddr5 boards The binary driver modifies the default context to have this value, rather than 0x3d0040, *after* it's filled the buffer with the usual golden data. We don't really have anything in place to locate the correct offset to do these type of modifications outside of the generation function, so this will have to do. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c b/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c index 552fdbd45ebe..1d0e33fb5f61 100644 --- a/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c +++ b/drivers/gpu/drm/nouveau/core/engine/graph/ctxnv50.c @@ -113,6 +113,8 @@ #define IS_NVA3F(x) (((x) > 0xa0 && (x) < 0xaa) || (x) == 0xaf) #define IS_NVAAF(x) ((x) >= 0xaa && (x) <= 0xac) +#include + /* * This code deals with PGRAPH contexts on NV50 family cards. Like NV40, it's * the GPU itself that does context-switching, but it needs a special @@ -569,8 +571,12 @@ nv50_graph_construct_mmio(struct nouveau_grctx *ctx) gr_def(ctx, 0x407d08, 0x00010040); else if (device->chipset < 0xa0) gr_def(ctx, 0x407d08, 0x00390040); - else - gr_def(ctx, 0x407d08, 0x003d0040); + else { + if (nouveau_fb(device)->ram->type != NV_MEM_TYPE_GDDR5) + gr_def(ctx, 0x407d08, 0x003d0040); + else + gr_def(ctx, 0x407d08, 0x003c0040); + } gr_def(ctx, 0x407d0c, 0x00000022); } -- cgit v1.2.3 From 67e26e41ff8aa514826dae79f0b10169b5ba93b4 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Mon, 20 Oct 2014 15:49:33 +1000 Subject: drm/nouveau: fix regression on agp boards Extends the fix in f2f9a2cbaf019481feefe231f996d3602612fa99 to also workaround permission issues noticed by people using AGP systems. Cc: stable@vger.kernel.org # 3.16: f2f9a2c: drm/nouveau: fix regression Cc: stable@vger.kernel.org # 3.16+ Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nouveau_chan.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index 977fb8f15d97..77c81d6b45ee 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -395,15 +395,20 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device, struct nouveau_channel **pchan) { struct nouveau_cli *cli = (void *)nvif_client(&device->base); + bool super; int ret; + /* hack until fencenv50 is fixed, and agp access relaxed */ + super = cli->base.super; + cli->base.super = true; + ret = nouveau_channel_ind(drm, device, handle, arg0, pchan); if (ret) { NV_PRINTK(debug, cli, "ib channel create, %d\n", ret); ret = nouveau_channel_dma(drm, device, handle, pchan); if (ret) { NV_PRINTK(debug, cli, "dma channel create, %d\n", ret); - return ret; + goto done; } } @@ -411,8 +416,9 @@ nouveau_channel_new(struct nouveau_drm *drm, struct nvif_device *device, if (ret) { NV_PRINTK(error, cli, "channel failed to initialise, %d\n", ret); nouveau_channel_del(pchan); - return ret; } - return 0; +done: + cli->base.super = super; + return ret; } -- cgit v1.2.3 From dec02f98ae2e341a2e0bb25f27e84867e5f9f64a Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sat, 4 Oct 2014 17:48:42 +0800 Subject: pwm: Let PWM_CLPS711X depend on HAS_IOMEM PWM_CLPS711X needs HAS_IOMEM, so depend on it, the related error (with allmodconfig under um): MODPOST 1205 modules ERROR: "devm_ioremap_resource" [drivers/pwm/pwm-clps711x.ko] undefined! ERROR: "devm_ioremap" [drivers/net/phy/mdio-bcm-unimac.ko] undefined! Signed-off-by: Chen Gang Signed-off-by: Thierry Reding --- drivers/pwm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig index 3865dfb9ed08..ef2dd2e4754b 100644 --- a/drivers/pwm/Kconfig +++ b/drivers/pwm/Kconfig @@ -83,6 +83,7 @@ config PWM_BFIN config PWM_CLPS711X tristate "CLPS711X PWM support" depends on ARCH_CLPS711X || COMPILE_TEST + depends on HAS_IOMEM help Generic PWM framework driver for Cirrus Logic CLPS711X. -- cgit v1.2.3 From b1974f965a506c131b60cd3e483340884e831920 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 20 Oct 2014 11:26:57 +0200 Subject: ALSA: hda - Fix inverted LED gpio setup for Lenovo Ideapad We implemented in a wrong way for mute LED on Lenovo Ideapad; the bit must be flipped. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=16373 Fixes: 3e887f379d8a ('ALSA: hda - Add mute LED support to Lenovo Ideapad') Cc: # 3.15+ Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4c35490cf119..34b7bdb510c7 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -5613,9 +5613,9 @@ static void alc662_led_gpio1_mute_hook(void *private_data, int enabled) unsigned int oldval = spec->gpio_led; if (enabled) - spec->gpio_led &= ~0x01; - else spec->gpio_led |= 0x01; + else + spec->gpio_led &= ~0x01; if (spec->gpio_led != oldval) snd_hda_codec_write(codec, 0x01, 0, AC_VERB_SET_GPIO_DATA, spec->gpio_led); -- cgit v1.2.3 From 04f905a9569ca2b6964a35563f135fabbb2470bc Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 10 Oct 2014 11:14:30 +0100 Subject: arm64: Allow 48-bits VA space without ARM_SMMU Now when KVM has been reworked to support 48-bits host VA space, we can allow systems to be configured with this option. However, the ARM SMMU driver also needs to be tweaked for 48-bit support so only allow the config option to be set when not including support for theSMMU. Signed-off-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ac9afde76dea..b8053be1d803 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -232,7 +232,7 @@ config ARM64_VA_BITS_42 config ARM64_VA_BITS_48 bool "48-bit" - depends on BROKEN + depends on !ARM_SMMU endchoice -- cgit v1.2.3 From 2a0b5c0d19298cad54573682321b5fe015fa5a0d Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 10 Oct 2014 15:37:28 +0100 Subject: arm64: Align less than PAGE_SIZE pgds naturally When the pgd size is smaller than PAGE_SIZE, pgd_alloc() uses kzalloc() to save space. However, this is not always naturally aligned as required by the architecture. This patch creates a kmem_cache for pgd allocations with the correct alignment. The current kernel configurations with 4K pages + 39-bit VA and 64K pages + 42-bit VA use a full page for the pgd and are not affected. The patch is required for 48-bit VA with 64K pages where the pgd is 512 bytes. Reported-by: Christoffer Dall Reviewed-by: Christoffer Dall Signed-off-by: Catalin Marinas --- arch/arm64/mm/pgd.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 62c6101df260..6682b361d3ac 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -30,12 +30,14 @@ #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) +static struct kmem_cache *pgd_cache; + pgd_t *pgd_alloc(struct mm_struct *mm) { if (PGD_SIZE == PAGE_SIZE) return (pgd_t *)get_zeroed_page(GFP_KERNEL); else - return kzalloc(PGD_SIZE, GFP_KERNEL); + return kmem_cache_zalloc(pgd_cache, GFP_KERNEL); } void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -43,5 +45,17 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) if (PGD_SIZE == PAGE_SIZE) free_page((unsigned long)pgd); else - kfree(pgd); + kmem_cache_free(pgd_cache, pgd); +} + +static int __init pgd_cache_init(void) +{ + /* + * Naturally aligned pgds required by the architecture. + */ + if (PGD_SIZE != PAGE_SIZE) + pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_SIZE, + SLAB_PANIC, NULL); + return 0; } +core_initcall(pgd_cache_init); -- cgit v1.2.3 From 971a5b6fe634bb7b617d8c5f25b6a3ddbc600194 Mon Sep 17 00:00:00 2001 From: Victor Kamensky Date: Tue, 14 Oct 2014 06:55:05 +0100 Subject: arm64: compat: fix compat types affecting struct compat_elf_prpsinfo The compat_elf_prpsinfo structure does not match the arch/arm struct elf_pspsinfo definition. As result NT_PRPSINFO note in core file created by arm64 kernel for aarch32 (compat) process has wrong size. So gdb cannot display command that caused process crash. Fix is to change size of __compat_uid_t, __compat_gid_t so it would match size of similar fields in arch/arm case. Signed-off-by: Victor Kamensky Acked-by: Arnd Bergmann Cc: Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/compat.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/compat.h b/arch/arm64/include/asm/compat.h index 253e33bc94fb..56de5aadede2 100644 --- a/arch/arm64/include/asm/compat.h +++ b/arch/arm64/include/asm/compat.h @@ -37,8 +37,8 @@ typedef s32 compat_ssize_t; typedef s32 compat_time_t; typedef s32 compat_clock_t; typedef s32 compat_pid_t; -typedef u32 __compat_uid_t; -typedef u32 __compat_gid_t; +typedef u16 __compat_uid_t; +typedef u16 __compat_gid_t; typedef u16 __compat_uid16_t; typedef u16 __compat_gid16_t; typedef u32 __compat_uid32_t; -- cgit v1.2.3 From c0260ba906c4dfbcccd6414c3e2c0e73a7d7e35a Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Fri, 17 Oct 2014 15:27:38 +0100 Subject: arm64: mm: Correct fixmap pagetable types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with STRICT_MM_TYPECHECKS gives the following arch/arm64/mm/ioremap.c: In function ‘early_ioremap_init’: arch/arm64/mm/ioremap.c:152:2: warning: passing argument 3 of ‘pud_populate’ from incompatible pointer type pud_populate(&init_mm, pud, bm_pmd); The data types for bm_pmd and bm_pud are incorrectly set to pte_t. This patch corrects these types. Signed-off-by: Steve Capper Signed-off-by: Catalin Marinas --- arch/arm64/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index fa324bd5a5c4..4a07630a6616 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -105,10 +105,10 @@ EXPORT_SYMBOL(ioremap_cache); static pte_t bm_pte[PTRS_PER_PTE] __page_aligned_bss; #if CONFIG_ARM64_PGTABLE_LEVELS > 2 -static pte_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pmd_t bm_pmd[PTRS_PER_PMD] __page_aligned_bss; #endif #if CONFIG_ARM64_PGTABLE_LEVELS > 3 -static pte_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; +static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss; #endif static inline pud_t * __init early_ioremap_pud(unsigned long addr) -- cgit v1.2.3 From b569c1c622c5e60c960a6ae5bd0880e0cdbd56b1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 16 Sep 2014 08:48:50 +0100 Subject: net: bpf: arm64: address randomize and write protect JIT code This is the ARM64 variant for 314beb9bcab ("x86: bpf_jit_comp: secure bpf jit against spraying attacks"). Thanks to commit 11d91a770f1f ("arm64: Add CONFIG_DEBUG_SET_MODULE_RONX support") which added necessary infrastructure, we can now implement RO marking of eBPF generated JIT image pages and randomize start offset for the JIT code, so that it does not reside directly on a page boundary anymore. Likewise, the holes are filled with illegal instructions: here we use BRK #0x100 (opcode 0xd4202000) to trigger a fault in the kernel (unallocated BRKs would trigger a fault through do_debug_exception). This seems more reliable as we don't have a guaranteed undefined instruction space on ARM64. This is basically the ARM64 variant of what we already have in ARM via commit 55309dd3d4cd ("net: bpf: arm: address randomize and write protect JIT code"). Moreover, this commit also presents a merge resolution due to conflicts with commit 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") as we don't use kfree() in bpf_jit_free() anymore to release the locked bpf_prog structure, but instead bpf_prog_unlock_free() through a different allocator. JIT tested on aarch64 with BPF test suite. Reference: http://mainisusuallyafunction.blogspot.com/2012/11/attacking-hardened-linux-systems-with.html Signed-off-by: Daniel Borkmann Reviewed-by: Zi Shen Lim Acked-by: Will Deacon Cc: David S. Miller Cc: Alexei Starovoitov Signed-off-by: Catalin Marinas --- arch/arm64/net/bpf_jit_comp.c | 39 ++++++++++++++++++++++++++++++--------- 1 file changed, 30 insertions(+), 9 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 7ae33545535b..71088952ed27 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -19,12 +19,13 @@ #define pr_fmt(fmt) "bpf_jit: " fmt #include -#include #include #include #include + #include #include +#include #include "bpf_jit.h" @@ -119,6 +120,14 @@ static inline int bpf2a64_offset(int bpf_to, int bpf_from, return to - from; } +static void jit_fill_hole(void *area, unsigned int size) +{ + u32 *ptr; + /* We are guaranteed to have aligned memory. */ + for (ptr = area; size >= sizeof(u32); size -= sizeof(u32)) + *ptr++ = cpu_to_le32(AARCH64_BREAK_FAULT); +} + static inline int epilogue_offset(const struct jit_ctx *ctx) { int to = ctx->offset[ctx->prog->len - 1]; @@ -613,8 +622,10 @@ void bpf_jit_compile(struct bpf_prog *prog) void bpf_int_jit_compile(struct bpf_prog *prog) { + struct bpf_binary_header *header; struct jit_ctx ctx; int image_size; + u8 *image_ptr; if (!bpf_jit_enable) return; @@ -636,23 +647,25 @@ void bpf_int_jit_compile(struct bpf_prog *prog) goto out; build_prologue(&ctx); - build_epilogue(&ctx); /* Now we know the actual image size. */ image_size = sizeof(u32) * ctx.idx; - ctx.image = module_alloc(image_size); - if (unlikely(ctx.image == NULL)) + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + if (header == NULL) goto out; /* 2. Now, the actual pass. */ + ctx.image = (u32 *)image_ptr; ctx.idx = 0; + build_prologue(&ctx); ctx.body_offset = ctx.idx; if (build_body(&ctx)) { - module_free(NULL, ctx.image); + bpf_jit_binary_free(header); goto out; } @@ -663,17 +676,25 @@ void bpf_int_jit_compile(struct bpf_prog *prog) bpf_jit_dump(prog->len, image_size, 2, ctx.image); bpf_flush_icache(ctx.image, ctx.image + ctx.idx); + + set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)ctx.image; prog->jited = 1; - out: kfree(ctx.offset); } void bpf_jit_free(struct bpf_prog *prog) { - if (prog->jited) - module_free(NULL, prog->bpf_func); + unsigned long addr = (unsigned long)prog->bpf_func & PAGE_MASK; + struct bpf_binary_header *header = (void *)addr; + + if (!prog->jited) + goto free_filter; + + set_memory_rw(addr, header->pages); + bpf_jit_binary_free(header); - kfree(prog); +free_filter: + bpf_prog_unlock_free(prog); } -- cgit v1.2.3 From d65a634a0acefd6b6e8718e2399b6771ccb17b24 Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Tue, 16 Sep 2014 19:37:35 +0100 Subject: arm64: bpf: add 'shift by register' instructions Commit 72b603ee8cfc ("bpf: x86: add missing 'shift by register' instructions to x64 eBPF JIT") noted support for 'shift by register' in eBPF and added support for it for x64. Let's enable this for arm64 as well. The arm64 eBPF JIT compiler now passes the new 'shift by register' test case introduced in the same commit 72b603ee8cfc. Signed-off-by: Zi Shen Lim Cc: Will Deacon Cc: David S. Miller Cc: Alexei Starovoitov Signed-off-by: Catalin Marinas --- arch/arm64/net/bpf_jit.h | 8 ++++++-- arch/arm64/net/bpf_jit_comp.c | 12 ++++++++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index 2134f7e6c288..de0a81a539a0 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -144,8 +144,12 @@ /* Data-processing (2 source) */ /* Rd = Rn OP Rm */ -#define A64_UDIV(sf, Rd, Rn, Rm) aarch64_insn_gen_data2(Rd, Rn, Rm, \ - A64_VARIANT(sf), AARCH64_INSN_DATA2_UDIV) +#define A64_DATA2(sf, Rd, Rn, Rm, type) aarch64_insn_gen_data2(Rd, Rn, Rm, \ + A64_VARIANT(sf), AARCH64_INSN_DATA2_##type) +#define A64_UDIV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, UDIV) +#define A64_LSLV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSLV) +#define A64_LSRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, LSRV) +#define A64_ASRV(sf, Rd, Rn, Rm) A64_DATA2(sf, Rd, Rn, Rm, ASRV) /* Data-processing (3 source) */ /* Rd = Ra + Rn * Rm */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 71088952ed27..80cc76972798 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -261,6 +261,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) emit(A64_MUL(is64, tmp, tmp, src), ctx); emit(A64_SUB(is64, dst, dst, tmp), ctx); break; + case BPF_ALU | BPF_LSH | BPF_X: + case BPF_ALU64 | BPF_LSH | BPF_X: + emit(A64_LSLV(is64, dst, dst, src), ctx); + break; + case BPF_ALU | BPF_RSH | BPF_X: + case BPF_ALU64 | BPF_RSH | BPF_X: + emit(A64_LSRV(is64, dst, dst, src), ctx); + break; + case BPF_ALU | BPF_ARSH | BPF_X: + case BPF_ALU64 | BPF_ARSH | BPF_X: + emit(A64_ASRV(is64, dst, dst, src), ctx); + break; /* dst = -dst */ case BPF_ALU | BPF_NEG: case BPF_ALU64 | BPF_NEG: -- cgit v1.2.3 From 30d3d94cc3d507a23dcb09e5c365464e4aa9f580 Mon Sep 17 00:00:00 2001 From: Zi Shen Lim Date: Tue, 16 Sep 2014 21:29:23 +0100 Subject: arm64: bpf: add 'load 64-bit immediate' instruction Commit 02ab695bb37e (net: filter: add "load 64-bit immediate" eBPF instruction) introduced a new eBPF instruction. Let's add support for this for arm64 as well. Our arm64 eBPF JIT compiler now passes the new "load 64-bit immediate" test case introduced in the same commit 02ab695bb37e. Signed-off-by: Zi Shen Lim Cc: Will Deacon Cc: David S. Miller Cc: Alexei Starovoitov Signed-off-by: Catalin Marinas --- arch/arm64/net/bpf_jit_comp.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 80cc76972798..618d2cdc2f1b 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -205,6 +205,12 @@ static void build_epilogue(struct jit_ctx *ctx) emit(A64_RET(A64_LR), ctx); } +/* JITs an eBPF instruction. + * Returns: + * 0 - successfully JITed an 8-byte eBPF instruction. + * >0 - successfully JITed a 16-byte eBPF instruction. + * <0 - failed to JIT. + */ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) { const u8 code = insn->code; @@ -464,6 +470,27 @@ emit_cond_jmp: emit(A64_B(jmp_offset), ctx); break; + /* dst = imm64 */ + case BPF_LD | BPF_IMM | BPF_DW: + { + const struct bpf_insn insn1 = insn[1]; + u64 imm64; + + if (insn1.code != 0 || insn1.src_reg != 0 || + insn1.dst_reg != 0 || insn1.off != 0) { + /* Note: verifier in BPF core must catch invalid + * instructions. + */ + pr_err_once("Invalid BPF_LD_IMM64 instruction\n"); + return -EINVAL; + } + + imm64 = (u64)insn1.imm << 32 | imm; + emit_a64_mov_i64(dst, imm64, ctx); + + return 1; + } + /* LDX: dst = *(size *)(src + off) */ case BPF_LDX | BPF_MEM | BPF_W: case BPF_LDX | BPF_MEM | BPF_H: @@ -615,6 +642,10 @@ static int build_body(struct jit_ctx *ctx) ctx->offset[i] = ctx->idx; ret = build_insn(insn, ctx); + if (ret > 0) { + i++; + continue; + } if (ret) return ret; } -- cgit v1.2.3 From 74c3deacb2a903bcb9d5a20dc885159142245010 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 11 Oct 2014 10:00:43 +0100 Subject: net: bpf: arm64: minor fix of type in jited Commit 286aad3c4014 ("net: bpf: be friendly to kmemcheck") changed the type of jited from a bitfield into a bool. As this commmit wasn't available at the time when arm64 eBPF JIT was merged, fix it up now as net is merged into mainline. Signed-off-by: Daniel Borkmann Cc: Zi Shen Lim Signed-off-by: Catalin Marinas --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 618d2cdc2f1b..41f1e3e2ea24 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -722,7 +722,7 @@ void bpf_int_jit_compile(struct bpf_prog *prog) set_memory_ro((unsigned long)header, header->pages); prog->bpf_func = (void *)ctx.image; - prog->jited = 1; + prog->jited = true; out: kfree(ctx.offset); } -- cgit v1.2.3 From a24637d5ddc215838776e755970bb199df00a1a5 Mon Sep 17 00:00:00 2001 From: Alex Bennée Date: Tue, 22 Jul 2014 16:14:42 +0100 Subject: Documentation/arm64/memory.txt: fix typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is no swapper_pgd_dir, it meant swapper_pg_dir. Signed-off-by: Alex Bennée Signed-off-by: Catalin Marinas --- Documentation/arm64/memory.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/arm64/memory.txt b/Documentation/arm64/memory.txt index 344e85cc7323..d7273a5f6456 100644 --- a/Documentation/arm64/memory.txt +++ b/Documentation/arm64/memory.txt @@ -17,7 +17,7 @@ User addresses have bits 63:48 set to 0 while the kernel addresses have the same bits set to 1. TTBRx selection is given by bit 63 of the virtual address. The swapper_pg_dir contains only kernel (global) mappings while the user pgd contains only user (non-global) mappings. -The swapper_pgd_dir address is written to TTBR1 and never written to +The swapper_pg_dir address is written to TTBR1 and never written to TTBR0. -- cgit v1.2.3 From fca8c0481bc8d751479ca13f454e89a7fdfece03 Mon Sep 17 00:00:00 2001 From: Michal Simek Date: Wed, 13 Aug 2014 13:51:28 +0200 Subject: watchdog: xilinx: Remove .owner field for driver There is no need to init .owner field. Based on the patch from Peter Griffin "mmc: remove .owner field for drivers using module_platform_driver" This patch removes the superflous .owner field for drivers which use the module_platform_driver API, as this is overriden in platform_driver_register anyway." Signed-off-by: Michal Simek Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/of_xilinx_wdt.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c index 1e6e28df5d7b..b2e1b4cbbdc1 100644 --- a/drivers/watchdog/of_xilinx_wdt.c +++ b/drivers/watchdog/of_xilinx_wdt.c @@ -236,7 +236,6 @@ static struct platform_driver xwdt_driver = { .probe = xwdt_probe, .remove = xwdt_remove, .driver = { - .owner = THIS_MODULE, .name = WATCHDOG_NAME, .of_match_table = xwdt_of_match, }, -- cgit v1.2.3 From 62ce25439a7ea01eba5c2a6a8284e4aa23890042 Mon Sep 17 00:00:00 2001 From: Pranith Kumar Date: Wed, 20 Aug 2014 15:26:46 -0400 Subject: powerpc: booke_wdt: Fix build error as a module Building booke_wdt fails when trying to build as a module as there is no early_param() in module. Fix by using module_param() instead of early_param(). Signed-off-by: Pranith Kumar Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/booke_wdt.c | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/drivers/watchdog/booke_wdt.c b/drivers/watchdog/booke_wdt.c index 08a785398eac..e96b09b135c8 100644 --- a/drivers/watchdog/booke_wdt.c +++ b/drivers/watchdog/booke_wdt.c @@ -30,8 +30,6 @@ * occur, and the final time the board will reset. */ -u32 booke_wdt_enabled; -u32 booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; #ifdef CONFIG_PPC_FSL_BOOK3E #define WDTP(x) ((((x)&0x3)<<30)|(((x)&0x3c)<<15)) @@ -41,27 +39,10 @@ u32 booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; #define WDTP_MASK (TCR_WP_MASK) #endif -/* Checks wdt=x and wdt_period=xx command-line option */ -notrace int __init early_parse_wdt(char *p) -{ - if (p && strncmp(p, "0", 1) != 0) - booke_wdt_enabled = 1; - - return 0; -} -early_param("wdt", early_parse_wdt); - -int __init early_parse_wdt_period(char *p) -{ - unsigned long ret; - if (p) { - if (!kstrtol(p, 0, &ret)) - booke_wdt_period = ret; - } - - return 0; -} -early_param("wdt_period", early_parse_wdt_period); +static bool booke_wdt_enabled; +module_param(booke_wdt_enabled, bool, 0); +static int booke_wdt_period = CONFIG_BOOKE_WDT_DEFAULT_TIMEOUT; +module_param(booke_wdt_period, int, 0); #ifdef CONFIG_PPC_FSL_BOOK3E @@ -259,5 +240,6 @@ static int __init booke_wdt_init(void) module_init(booke_wdt_init); module_exit(booke_wdt_exit); +MODULE_ALIAS("booke_wdt"); MODULE_DESCRIPTION("PowerPC Book-E watchdog driver"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 1f897a81915222310374cac1a85c0c7104f16249 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Tue, 19 Aug 2014 14:57:12 +0300 Subject: watchdog: ts72xx_wdt: Kill superfluous variable in remove There is no need to store the return value of misc_deregister() in a variable. Instead we can just return the value directly. Signed-off-by: Mika Westerberg Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/ts72xx_wdt.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/watchdog/ts72xx_wdt.c b/drivers/watchdog/ts72xx_wdt.c index afa9d6ef353a..dee9c6cbe6df 100644 --- a/drivers/watchdog/ts72xx_wdt.c +++ b/drivers/watchdog/ts72xx_wdt.c @@ -428,11 +428,7 @@ static int ts72xx_wdt_probe(struct platform_device *pdev) static int ts72xx_wdt_remove(struct platform_device *pdev) { - int error; - - error = misc_deregister(&ts72xx_wdt_miscdev); - - return error; + return misc_deregister(&ts72xx_wdt_miscdev); } static struct platform_driver ts72xx_wdt_driver = { -- cgit v1.2.3 From 0461aea7ec379b00f4acb5d612bfb2f7a497eb92 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Mon, 18 Aug 2014 16:12:50 +0800 Subject: watchdog: imx2_wdt: Convert to use regmap framework's endianness method. Signed-off-by: Xiubo Li Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt | 3 ++- drivers/watchdog/imx2_wdt.c | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt index e52ba2da868c..8dab6fd024aa 100644 --- a/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/fsl-imx-wdt.txt @@ -7,7 +7,8 @@ Required properties: Optional property: - big-endian: If present the watchdog device's registers are implemented - in big endian mode, otherwise in little mode. + in big endian mode, otherwise in native mode(same with CPU), for more + detail please see: Documentation/devicetree/bindings/regmap/regmap.txt. Examples: diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index 68c3d379ffa8..f37bb05e7ec0 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -191,12 +191,10 @@ static struct regmap_config imx2_wdt_regmap_config = { static int __init imx2_wdt_probe(struct platform_device *pdev) { - struct device_node *np = pdev->dev.of_node; struct imx2_wdt_device *wdev; struct watchdog_device *wdog; struct resource *res; void __iomem *base; - bool big_endian; int ret; u32 val; @@ -204,10 +202,6 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) if (!wdev) return -ENOMEM; - big_endian = of_property_read_bool(np, "big-endian"); - if (big_endian) - imx2_wdt_regmap_config.val_format_endian = REGMAP_ENDIAN_BIG; - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(base)) -- cgit v1.2.3 From 4846e3784585173f48e267b76f968bcb4a12d3b2 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 9 Sep 2014 22:18:31 +0200 Subject: watchdog: simplify definitions of WATCHDOG_NOWAYOUT(_INIT_STATUS)? Signed-off-by: Uwe Kleine-K=C3=B6nig Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- include/linux/watchdog.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 2a3038ee17a3..395b70e0eccf 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -97,13 +97,8 @@ struct watchdog_device { #define WDOG_UNREGISTERED 4 /* Has the device been unregistered */ }; -#ifdef CONFIG_WATCHDOG_NOWAYOUT -#define WATCHDOG_NOWAYOUT 1 -#define WATCHDOG_NOWAYOUT_INIT_STATUS (1 << WDOG_NO_WAY_OUT) -#else -#define WATCHDOG_NOWAYOUT 0 -#define WATCHDOG_NOWAYOUT_INIT_STATUS 0 -#endif +#define WATCHDOG_NOWAYOUT IS_BUILTIN(CONFIG_WATCHDOG_NOWAYOUT) +#define WATCHDOG_NOWAYOUT_INIT_STATUS (WATCHDOG_NOWAYOUT << WDOG_NO_WAY_OUT) /* Use the following function to check whether or not the watchdog is active */ static inline bool watchdog_active(struct watchdog_device *wdd) -- cgit v1.2.3 From 58bf016426594e5370e7e7059698a278294db997 Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Fri, 22 Aug 2014 14:58:01 +0530 Subject: watchdog: Add Cadence WDT driver Add Cadence WDT driver. This is used by Xilinx Zynq. Signed-off-by: Harini Katakam Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 8 + drivers/watchdog/Makefile | 1 + drivers/watchdog/cadence_wdt.c | 516 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 525 insertions(+) create mode 100644 drivers/watchdog/cadence_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index e3d5bf0a5021..786f8c8338fb 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -157,6 +157,14 @@ config AT91SAM9X_WATCHDOG Watchdog timer embedded into AT91SAM9X and AT91CAP9 chips. This will reboot your system when the timeout is reached. +config CADENCE_WATCHDOG + tristate "Cadence Watchdog Timer" + depends on ARM + select WATCHDOG_CORE + help + Say Y here if you want to include support for the watchdog + timer in the Xilinx Zynq. + config 21285_WATCHDOG tristate "DC21285 watchdog" depends on FOOTBRIDGE diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index de1701470c14..82af9956f96a 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_USBPCWATCHDOG) += pcwd_usb.o obj-$(CONFIG_ARM_SP805_WATCHDOG) += sp805_wdt.o obj-$(CONFIG_AT91RM9200_WATCHDOG) += at91rm9200_wdt.o obj-$(CONFIG_AT91SAM9X_WATCHDOG) += at91sam9_wdt.o +obj-$(CONFIG_CADENCE_WATCHDOG) += cadence_wdt.o obj-$(CONFIG_OMAP_WATCHDOG) += omap_wdt.o obj-$(CONFIG_TWL4030_WATCHDOG) += twl4030_wdt.o obj-$(CONFIG_21285_WATCHDOG) += wdt285.o diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c new file mode 100644 index 000000000000..5927c0a98a74 --- /dev/null +++ b/drivers/watchdog/cadence_wdt.c @@ -0,0 +1,516 @@ +/* + * Cadence WDT driver - Used by Xilinx Zynq + * + * Copyright (C) 2010 - 2014 Xilinx, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CDNS_WDT_DEFAULT_TIMEOUT 10 +/* Supports 1 - 516 sec */ +#define CDNS_WDT_MIN_TIMEOUT 1 +#define CDNS_WDT_MAX_TIMEOUT 516 + +/* Restart key */ +#define CDNS_WDT_RESTART_KEY 0x00001999 + +/* Counter register access key */ +#define CDNS_WDT_REGISTER_ACCESS_KEY 0x00920000 + +/* Counter value divisor */ +#define CDNS_WDT_COUNTER_VALUE_DIVISOR 0x1000 + +/* Clock prescaler value and selection */ +#define CDNS_WDT_PRESCALE_64 64 +#define CDNS_WDT_PRESCALE_512 512 +#define CDNS_WDT_PRESCALE_4096 4096 +#define CDNS_WDT_PRESCALE_SELECT_64 1 +#define CDNS_WDT_PRESCALE_SELECT_512 2 +#define CDNS_WDT_PRESCALE_SELECT_4096 3 + +/* Input clock frequency */ +#define CDNS_WDT_CLK_10MHZ 10000000 +#define CDNS_WDT_CLK_75MHZ 75000000 + +/* Counter maximum value */ +#define CDNS_WDT_COUNTER_MAX 0xFFF + +static int wdt_timeout = CDNS_WDT_DEFAULT_TIMEOUT; +static int nowayout = WATCHDOG_NOWAYOUT; + +module_param(wdt_timeout, int, 0); +MODULE_PARM_DESC(wdt_timeout, + "Watchdog time in seconds. (default=" + __MODULE_STRING(CDNS_WDT_DEFAULT_TIMEOUT) ")"); + +module_param(nowayout, int, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +/** + * struct cdns_wdt - Watchdog device structure + * @regs: baseaddress of device + * @rst: reset flag + * @clk: struct clk * of a clock source + * @prescaler: for saving prescaler value + * @ctrl_clksel: counter clock prescaler selection + * @io_lock: spinlock for IO register access + * @cdns_wdt_device: watchdog device structure + * @cdns_wdt_notifier: notifier structure + * + * Structure containing parameters specific to cadence watchdog. + */ +struct cdns_wdt { + void __iomem *regs; + bool rst; + struct clk *clk; + u32 prescaler; + u32 ctrl_clksel; + spinlock_t io_lock; + struct watchdog_device cdns_wdt_device; + struct notifier_block cdns_wdt_notifier; +}; + +/* Write access to Registers */ +static inline void cdns_wdt_writereg(struct cdns_wdt *wdt, u32 offset, u32 val) +{ + writel_relaxed(val, wdt->regs + offset); +} + +/*************************Register Map**************************************/ + +/* Register Offsets for the WDT */ +#define CDNS_WDT_ZMR_OFFSET 0x0 /* Zero Mode Register */ +#define CDNS_WDT_CCR_OFFSET 0x4 /* Counter Control Register */ +#define CDNS_WDT_RESTART_OFFSET 0x8 /* Restart Register */ +#define CDNS_WDT_SR_OFFSET 0xC /* Status Register */ + +/* + * Zero Mode Register - This register controls how the time out is indicated + * and also contains the access code to allow writes to the register (0xABC). + */ +#define CDNS_WDT_ZMR_WDEN_MASK 0x00000001 /* Enable the WDT */ +#define CDNS_WDT_ZMR_RSTEN_MASK 0x00000002 /* Enable the reset output */ +#define CDNS_WDT_ZMR_IRQEN_MASK 0x00000004 /* Enable IRQ output */ +#define CDNS_WDT_ZMR_RSTLEN_16 0x00000030 /* Reset pulse of 16 pclk cycles */ +#define CDNS_WDT_ZMR_ZKEY_VAL 0x00ABC000 /* Access key, 0xABC << 12 */ +/* + * Counter Control register - This register controls how fast the timer runs + * and the reset value and also contains the access code to allow writes to + * the register. + */ +#define CDNS_WDT_CCR_CRV_MASK 0x00003FFC /* Counter reset value */ + +/** + * cdns_wdt_stop - Stop the watchdog. + * + * @wdd: watchdog device + * + * Read the contents of the ZMR register, clear the WDEN bit + * in the register and set the access key for successful write. + * + * Return: always 0 + */ +static int cdns_wdt_stop(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, + CDNS_WDT_ZMR_ZKEY_VAL & (~CDNS_WDT_ZMR_WDEN_MASK)); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_reload - Reload the watchdog timer (i.e. pat the watchdog). + * + * @wdd: watchdog device + * + * Write the restart key value (0x00001999) to the restart register. + * + * Return: always 0 + */ +static int cdns_wdt_reload(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_RESTART_OFFSET, + CDNS_WDT_RESTART_KEY); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_start - Enable and start the watchdog. + * + * @wdd: watchdog device + * + * The counter value is calculated according to the formula: + * calculated count = (timeout * clock) / prescaler + 1. + * The calculated count is divided by 0x1000 to obtain the field value + * to write to counter control register. + * Clears the contents of prescaler and counter reset value. Sets the + * prescaler to 4096 and the calculated count and access key + * to write to CCR Register. + * Sets the WDT (WDEN bit) and either the Reset signal(RSTEN bit) + * or Interrupt signal(IRQEN) with a specified cycles and the access + * key to write to ZMR Register. + * + * Return: always 0 + */ +static int cdns_wdt_start(struct watchdog_device *wdd) +{ + struct cdns_wdt *wdt = watchdog_get_drvdata(wdd); + unsigned int data = 0; + unsigned short count; + unsigned long clock_f = clk_get_rate(wdt->clk); + + /* + * Counter value divisor to obtain the value of + * counter reset to be written to control register. + */ + count = (wdd->timeout * (clock_f / wdt->prescaler)) / + CDNS_WDT_COUNTER_VALUE_DIVISOR + 1; + + if (count > CDNS_WDT_COUNTER_MAX) + count = CDNS_WDT_COUNTER_MAX; + + spin_lock(&wdt->io_lock); + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, + CDNS_WDT_ZMR_ZKEY_VAL); + + count = (count << 2) & CDNS_WDT_CCR_CRV_MASK; + + /* Write counter access key first to be able write to register */ + data = count | CDNS_WDT_REGISTER_ACCESS_KEY | wdt->ctrl_clksel; + cdns_wdt_writereg(wdt, CDNS_WDT_CCR_OFFSET, data); + data = CDNS_WDT_ZMR_WDEN_MASK | CDNS_WDT_ZMR_RSTLEN_16 | + CDNS_WDT_ZMR_ZKEY_VAL; + + /* Reset on timeout if specified in device tree. */ + if (wdt->rst) { + data |= CDNS_WDT_ZMR_RSTEN_MASK; + data &= ~CDNS_WDT_ZMR_IRQEN_MASK; + } else { + data &= ~CDNS_WDT_ZMR_RSTEN_MASK; + data |= CDNS_WDT_ZMR_IRQEN_MASK; + } + cdns_wdt_writereg(wdt, CDNS_WDT_ZMR_OFFSET, data); + cdns_wdt_writereg(wdt, CDNS_WDT_RESTART_OFFSET, + CDNS_WDT_RESTART_KEY); + spin_unlock(&wdt->io_lock); + + return 0; +} + +/** + * cdns_wdt_settimeout - Set a new timeout value for the watchdog device. + * + * @wdd: watchdog device + * @new_time: new timeout value that needs to be set + * Return: 0 on success + * + * Update the watchdog_device timeout with new value which is used when + * cdns_wdt_start is called. + */ +static int cdns_wdt_settimeout(struct watchdog_device *wdd, + unsigned int new_time) +{ + wdd->timeout = new_time; + + return cdns_wdt_start(wdd); +} + +/** + * cdns_wdt_irq_handler - Notifies of watchdog timeout. + * + * @irq: interrupt number + * @dev_id: pointer to a platform device structure + * Return: IRQ_HANDLED + * + * The handler is invoked when the watchdog times out and a + * reset on timeout has not been enabled. + */ +static irqreturn_t cdns_wdt_irq_handler(int irq, void *dev_id) +{ + struct platform_device *pdev = dev_id; + + dev_info(&pdev->dev, + "Watchdog timed out. Internal reset not enabled\n"); + + return IRQ_HANDLED; +} + +/* + * Info structure used to indicate the features supported by the device + * to the upper layers. This is defined in watchdog.h header file. + */ +static struct watchdog_info cdns_wdt_info = { + .identity = "cdns_wdt watchdog", + .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | + WDIOF_MAGICCLOSE, +}; + +/* Watchdog Core Ops */ +static struct watchdog_ops cdns_wdt_ops = { + .owner = THIS_MODULE, + .start = cdns_wdt_start, + .stop = cdns_wdt_stop, + .ping = cdns_wdt_reload, + .set_timeout = cdns_wdt_settimeout, +}; + +/** + * cdns_wdt_notify_sys - Notifier for reboot or shutdown. + * + * @this: handle to notifier block + * @code: turn off indicator + * @unused: unused + * Return: NOTIFY_DONE + * + * This notifier is invoked whenever the system reboot or shutdown occur + * because we need to disable the WDT before system goes down as WDT might + * reset on the next boot. + */ +static int cdns_wdt_notify_sys(struct notifier_block *this, unsigned long code, + void *unused) +{ + struct cdns_wdt *wdt = container_of(this, struct cdns_wdt, + cdns_wdt_notifier); + if (code == SYS_DOWN || code == SYS_HALT) + cdns_wdt_stop(&wdt->cdns_wdt_device); + + return NOTIFY_DONE; +} + +/************************Platform Operations*****************************/ +/** + * cdns_wdt_probe - Probe call for the device. + * + * @pdev: handle to the platform device structure. + * Return: 0 on success, negative error otherwise. + * + * It does all the memory allocation and registration for the device. + */ +static int cdns_wdt_probe(struct platform_device *pdev) +{ + struct resource *res; + int ret, irq; + unsigned long clock_f; + struct cdns_wdt *wdt; + struct watchdog_device *cdns_wdt_device; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + cdns_wdt_device = &wdt->cdns_wdt_device; + cdns_wdt_device->info = &cdns_wdt_info; + cdns_wdt_device->ops = &cdns_wdt_ops; + cdns_wdt_device->timeout = CDNS_WDT_DEFAULT_TIMEOUT; + cdns_wdt_device->min_timeout = CDNS_WDT_MIN_TIMEOUT; + cdns_wdt_device->max_timeout = CDNS_WDT_MAX_TIMEOUT; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + wdt->regs = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(wdt->regs)) + return PTR_ERR(wdt->regs); + + /* Register the interrupt */ + wdt->rst = of_property_read_bool(pdev->dev.of_node, "reset-on-timeout"); + irq = platform_get_irq(pdev, 0); + if (!wdt->rst && irq >= 0) { + ret = devm_request_irq(&pdev->dev, irq, cdns_wdt_irq_handler, 0, + pdev->name, pdev); + if (ret) { + dev_err(&pdev->dev, + "cannot register interrupt handler err=%d\n", + ret); + return ret; + } + } + + /* Initialize the members of cdns_wdt structure */ + cdns_wdt_device->parent = &pdev->dev; + + ret = watchdog_init_timeout(cdns_wdt_device, wdt_timeout, &pdev->dev); + if (ret) { + dev_err(&pdev->dev, "unable to set timeout value\n"); + return ret; + } + + watchdog_set_nowayout(cdns_wdt_device, nowayout); + watchdog_set_drvdata(cdns_wdt_device, wdt); + + wdt->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(wdt->clk)) { + dev_err(&pdev->dev, "input clock not found\n"); + ret = PTR_ERR(wdt->clk); + return ret; + } + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(&pdev->dev, "unable to enable clock\n"); + return ret; + } + + clock_f = clk_get_rate(wdt->clk); + if (clock_f <= CDNS_WDT_CLK_75MHZ) { + wdt->prescaler = CDNS_WDT_PRESCALE_512; + wdt->ctrl_clksel = CDNS_WDT_PRESCALE_SELECT_512; + } else { + wdt->prescaler = CDNS_WDT_PRESCALE_4096; + wdt->ctrl_clksel = CDNS_WDT_PRESCALE_SELECT_4096; + } + + spin_lock_init(&wdt->io_lock); + + wdt->cdns_wdt_notifier.notifier_call = &cdns_wdt_notify_sys; + ret = register_reboot_notifier(&wdt->cdns_wdt_notifier); + if (ret != 0) { + dev_err(&pdev->dev, "cannot register reboot notifier err=%d)\n", + ret); + goto err_clk_disable; + } + + ret = watchdog_register_device(cdns_wdt_device); + if (ret) { + dev_err(&pdev->dev, "Failed to register wdt device\n"); + goto err_clk_disable; + } + platform_set_drvdata(pdev, wdt); + + dev_dbg(&pdev->dev, "Xilinx Watchdog Timer at %p with timeout %ds%s\n", + wdt->regs, cdns_wdt_device->timeout, + nowayout ? ", nowayout" : ""); + + return 0; + +err_clk_disable: + clk_disable_unprepare(wdt->clk); + + return ret; +} + +/** + * cdns_wdt_remove - Probe call for the device. + * + * @pdev: handle to the platform device structure. + * Return: 0 on success, otherwise negative error. + * + * Unregister the device after releasing the resources. + */ +static int cdns_wdt_remove(struct platform_device *pdev) +{ + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + watchdog_unregister_device(&wdt->cdns_wdt_device); + unregister_reboot_notifier(&wdt->cdns_wdt_notifier); + clk_disable_unprepare(wdt->clk); + + return 0; +} + +/** + * cdns_wdt_shutdown - Stop the device. + * + * @pdev: handle to the platform structure. + * + */ +static void cdns_wdt_shutdown(struct platform_device *pdev) +{ + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + clk_disable_unprepare(wdt->clk); +} + +/** + * cdns_wdt_suspend - Stop the device. + * + * @dev: handle to the device structure. + * Return: 0 always. + */ +static int __maybe_unused cdns_wdt_suspend(struct device *dev) +{ + struct platform_device *pdev = container_of(dev, + struct platform_device, dev); + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + cdns_wdt_stop(&wdt->cdns_wdt_device); + clk_disable_unprepare(wdt->clk); + + return 0; +} + +/** + * cdns_wdt_resume - Resume the device. + * + * @dev: handle to the device structure. + * Return: 0 on success, errno otherwise. + */ +static int __maybe_unused cdns_wdt_resume(struct device *dev) +{ + int ret; + struct platform_device *pdev = container_of(dev, + struct platform_device, dev); + struct cdns_wdt *wdt = platform_get_drvdata(pdev); + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(dev, "unable to enable clock\n"); + return ret; + } + cdns_wdt_start(&wdt->cdns_wdt_device); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(cdns_wdt_pm_ops, cdns_wdt_suspend, cdns_wdt_resume); + +static struct of_device_id cdns_wdt_of_match[] = { + { .compatible = "cdns,wdt-r1p2", }, + { /* end of table */ } +}; +MODULE_DEVICE_TABLE(of, cdns_wdt_of_match); + +/* Driver Structure */ +static struct platform_driver cdns_wdt_driver = { + .probe = cdns_wdt_probe, + .remove = cdns_wdt_remove, + .shutdown = cdns_wdt_shutdown, + .driver = { + .name = "cdns-wdt", + .owner = THIS_MODULE, + .of_match_table = cdns_wdt_of_match, + .pm = &cdns_wdt_pm_ops, + }, +}; + +module_platform_driver(cdns_wdt_driver); + +MODULE_AUTHOR("Xilinx, Inc."); +MODULE_DESCRIPTION("Watchdog driver for Cadence WDT"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 191891c0378f44aec8e06e889a08d0b76fe6c5cb Mon Sep 17 00:00:00 2001 From: Harini Katakam Date: Fri, 22 Aug 2014 14:58:02 +0530 Subject: devicetree: Add Cadence WDT devicetree bindings documentation Add cadence-wdt bindings documentation. Signed-off-by: Harini Katakam Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/cadence-wdt.txt | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/cadence-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt new file mode 100644 index 000000000000..c3a36ee45552 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/cadence-wdt.txt @@ -0,0 +1,24 @@ +Zynq Watchdog Device Tree Bindings +------------------------------------------- + +Required properties: +- compatible : Should be "cdns,wdt-r1p2". +- clocks : This is pclk (APB clock). +- interrupts : This is wd_irq - watchdog timeout interrupt. +- interrupt-parent : Must be core interrupt controller. + +Optional properties +- reset-on-timeout : If this property exists, then a reset is done + when watchdog times out. +- timeout-sec : Watchdog timeout value (in seconds). + +Example: + watchdog@f8005000 { + compatible = "cdns,wdt-r1p2"; + clocks = <&clkc 45>; + interrupt-parent = <&intc>; + interrupts = <0 9 1>; + reg = <0xf8005000 0x1000>; + reset-on-timeout; + timeout-sec = <10>; + }; -- cgit v1.2.3 From dfa07141e7a792aecf98a8a99dd40df0bf91bce2 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 23 Sep 2014 15:42:11 +0800 Subject: watchdog: dw_wdt: initialise TOP_INIT in dw_wdt_set_top() The TOP_INIT, ie bit 4-7 of the WDOG_TIMEOUT_RANGE_REG_OFFSET register may be zero, so the timeout period may be very short after initialization is done, thus the system may be reset soon after enabling. We fix this problem by also initialising the TOP_INIT when setting TOP in function dw_wdt_set_top(). Signed-off-by: Jisheng Zhang Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/dw_wdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/dw_wdt.c b/drivers/watchdog/dw_wdt.c index 9f210299de24..449c88523364 100644 --- a/drivers/watchdog/dw_wdt.c +++ b/drivers/watchdog/dw_wdt.c @@ -40,6 +40,7 @@ #define WDOG_CONTROL_REG_OFFSET 0x00 #define WDOG_CONTROL_REG_WDT_EN_MASK 0x01 #define WDOG_TIMEOUT_RANGE_REG_OFFSET 0x04 +#define WDOG_TIMEOUT_RANGE_TOPINIT_SHIFT 4 #define WDOG_CURRENT_COUNT_REG_OFFSET 0x08 #define WDOG_COUNTER_RESTART_REG_OFFSET 0x0c #define WDOG_COUNTER_RESTART_KICK_VALUE 0x76 @@ -106,7 +107,8 @@ static int dw_wdt_set_top(unsigned top_s) } /* Set the new value in the watchdog. */ - writel(top_val, dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); + writel(top_val | top_val << WDOG_TIMEOUT_RANGE_TOPINIT_SHIFT, + dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); dw_wdt_set_next_heartbeat(); -- cgit v1.2.3 From 1094ebe9d1e1dde0754ff8cede16159fb20b2f3b Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 25 Sep 2014 17:51:02 -0500 Subject: watchdog: qcom: add support for KPSS WDT Add a driver for the watchdog timer block found in the Krait Processor Subsystem (KPSS) on the MSM8960, APQ8064, and IPQ8064. Signed-off-by: Josh Cartwright Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 13 ++++ drivers/watchdog/Makefile | 1 + drivers/watchdog/qcom-wdt.c | 186 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+) create mode 100644 drivers/watchdog/qcom-wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 786f8c8338fb..17b39b8a2520 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -461,6 +461,19 @@ config TEGRA_WATCHDOG To compile this driver as a module, choose M here: the module will be called tegra_wdt. +config QCOM_WDT + tristate "QCOM watchdog" + depends on HAS_IOMEM + depends on ARCH_QCOM + select WATCHDOG_CORE + help + Say Y here to include Watchdog timer support for the watchdog found + on QCOM chipsets. Currently supported targets are the MSM8960, + APQ8064, and IPQ8064. + + To compile this driver as a module, choose M here: the + module will be called qcom_wdt. + # AVR32 Architecture config AT32AP700X_WDT diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 82af9956f96a..44f5d6842fc6 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -58,6 +58,7 @@ obj-$(CONFIG_RETU_WATCHDOG) += retu_wdt.o obj-$(CONFIG_BCM2835_WDT) += bcm2835_wdt.o obj-$(CONFIG_MOXART_WDT) += moxart_wdt.o obj-$(CONFIG_SIRFSOC_WATCHDOG) += sirfsoc_wdt.o +obj-$(CONFIG_QCOM_WDT) += qcom-wdt.o obj-$(CONFIG_BCM_KONA_WDT) += bcm_kona_wdt.o obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c new file mode 100644 index 000000000000..68db322341bc --- /dev/null +++ b/drivers/watchdog/qcom-wdt.c @@ -0,0 +1,186 @@ +/* Copyright (c) 2014, The Linux Foundation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include +#include +#include +#include +#include +#include +#include + +#define WDT_RST 0x0 +#define WDT_EN 0x8 +#define WDT_BITE_TIME 0x24 + +struct qcom_wdt { + struct watchdog_device wdd; + struct clk *clk; + unsigned long rate; + void __iomem *base; +}; + +static inline +struct qcom_wdt *to_qcom_wdt(struct watchdog_device *wdd) +{ + return container_of(wdd, struct qcom_wdt, wdd); +} + +static int qcom_wdt_start(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(0, wdt->base + WDT_EN); + writel(1, wdt->base + WDT_RST); + writel(wdd->timeout * wdt->rate, wdt->base + WDT_BITE_TIME); + writel(1, wdt->base + WDT_EN); + return 0; +} + +static int qcom_wdt_stop(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(0, wdt->base + WDT_EN); + return 0; +} + +static int qcom_wdt_ping(struct watchdog_device *wdd) +{ + struct qcom_wdt *wdt = to_qcom_wdt(wdd); + + writel(1, wdt->base + WDT_RST); + return 0; +} + +static int qcom_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + wdd->timeout = timeout; + return qcom_wdt_start(wdd); +} + +static const struct watchdog_ops qcom_wdt_ops = { + .start = qcom_wdt_start, + .stop = qcom_wdt_stop, + .ping = qcom_wdt_ping, + .set_timeout = qcom_wdt_set_timeout, + .owner = THIS_MODULE, +}; + +static const struct watchdog_info qcom_wdt_info = { + .options = WDIOF_KEEPALIVEPING + | WDIOF_MAGICCLOSE + | WDIOF_SETTIMEOUT, + .identity = KBUILD_MODNAME, +}; + +static int qcom_wdt_probe(struct platform_device *pdev) +{ + struct qcom_wdt *wdt; + struct resource *res; + int ret; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + wdt->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(wdt->base)) + return PTR_ERR(wdt->base); + + wdt->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(wdt->clk)) { + dev_err(&pdev->dev, "failed to get input clock\n"); + return PTR_ERR(wdt->clk); + } + + ret = clk_prepare_enable(wdt->clk); + if (ret) { + dev_err(&pdev->dev, "failed to setup clock\n"); + return ret; + } + + /* + * We use the clock rate to calculate the max timeout, so ensure it's + * not zero to avoid a divide-by-zero exception. + * + * WATCHDOG_CORE assumes units of seconds, if the WDT is clocked such + * that it would bite before a second elapses it's usefulness is + * limited. Bail if this is the case. + */ + wdt->rate = clk_get_rate(wdt->clk); + if (wdt->rate == 0 || + wdt->rate > 0x10000000U) { + dev_err(&pdev->dev, "invalid clock rate\n"); + ret = -EINVAL; + goto err_clk_unprepare; + } + + wdt->wdd.dev = &pdev->dev; + wdt->wdd.info = &qcom_wdt_info; + wdt->wdd.ops = &qcom_wdt_ops; + wdt->wdd.min_timeout = 1; + wdt->wdd.max_timeout = 0x10000000U / wdt->rate; + + /* + * If 'timeout-sec' unspecified in devicetree, assume a 30 second + * default, unless the max timeout is less than 30 seconds, then use + * the max instead. + */ + wdt->wdd.timeout = min(wdt->wdd.max_timeout, 30U); + watchdog_init_timeout(&wdt->wdd, 0, &pdev->dev); + + ret = watchdog_register_device(&wdt->wdd); + if (ret) { + dev_err(&pdev->dev, "failed to register watchdog\n"); + goto err_clk_unprepare; + } + + platform_set_drvdata(pdev, wdt); + return 0; + +err_clk_unprepare: + clk_disable_unprepare(wdt->clk); + return ret; +} + +static int qcom_wdt_remove(struct platform_device *pdev) +{ + struct qcom_wdt *wdt = platform_get_drvdata(pdev); + + watchdog_unregister_device(&wdt->wdd); + clk_disable_unprepare(wdt->clk); + return 0; +} + +static const struct of_device_id qcom_wdt_of_table[] = { + { .compatible = "qcom,kpss-wdt-msm8960", }, + { .compatible = "qcom,kpss-wdt-apq8064", }, + { .compatible = "qcom,kpss-wdt-ipq8064", }, + { }, +}; +MODULE_DEVICE_TABLE(of, qcom_wdt_of_table); + +static struct platform_driver qcom_watchdog_driver = { + .probe = qcom_wdt_probe, + .remove = qcom_wdt_remove, + .driver = { + .name = KBUILD_MODNAME, + .of_match_table = qcom_wdt_of_table, + }, +}; +module_platform_driver(qcom_watchdog_driver); + +MODULE_DESCRIPTION("QCOM KPSS Watchdog Driver"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 7c92c3d58429c38557ffd7e6a69dc97522335454 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 25 Sep 2014 17:51:03 -0500 Subject: watchdog: qcom: document device tree bindings The Qualcomm Krait Processor Sub-system (KPSS) contains one or more instances of the WDT. Provide documentation on how to describe these in the device tree. Signed-off-by: Josh Cartwright Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- .../devicetree/bindings/watchdog/qcom-wdt.txt | 24 ++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/qcom-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt new file mode 100644 index 000000000000..4726924d034e --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/qcom-wdt.txt @@ -0,0 +1,24 @@ +Qualcomm Krait Processor Sub-system (KPSS) Watchdog +--------------------------------------------------- + +Required properties : +- compatible : shall contain only one of the following: + + "qcom,kpss-wdt-msm8960" + "qcom,kpss-wdt-apq8064" + "qcom,kpss-wdt-ipq8064" + +- reg : shall contain base register location and length +- clocks : shall contain the input clock + +Optional properties : +- timeout-sec : shall contain the default watchdog timeout in seconds, + if unset, the default timeout is 30 seconds + +Example: + watchdog@208a038 { + compatible = "qcom,kpss-wdt-ipq8064"; + reg = <0x0208a038 0x40>; + clocks = <&sleep_clk>; + timeout-sec = <10>; + }; -- cgit v1.2.3 From 2b9366b669679f1388457ec5a62f9dd1d0a78b08 Mon Sep 17 00:00:00 2001 From: Naveen Krishna Chatradhi Date: Wed, 27 Aug 2014 15:17:11 +0530 Subject: watchdog: s3c2410_wdt: Add support for Watchdog device on Exynos7 Exynos7 SoC has a Watchdog for Atlas (A57) cores This patch adds support for the Atlas watchdog. Signed-off-by: Naveen Krishna Chatradhi Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/samsung-wdt.txt | 1 + drivers/watchdog/s3c2410_wdt.c | 11 +++++++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt index cfff37511aac..8f3d96af81d7 100644 --- a/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/samsung-wdt.txt @@ -9,6 +9,7 @@ Required properties: (a) "samsung,s3c2410-wdt" for Exynos4 and previous SoCs (b) "samsung,exynos5250-wdt" for Exynos5250 (c) "samsung,exynos5420-wdt" for Exynos5420 + (c) "samsung,exynos7-wdt" for Exynos7 - reg : base physical address of the controller and length of memory mapped region. diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c index 7c6ccd071baf..015256e496ae 100644 --- a/drivers/watchdog/s3c2410_wdt.c +++ b/drivers/watchdog/s3c2410_wdt.c @@ -155,6 +155,15 @@ static const struct s3c2410_wdt_variant drv_data_exynos5420 = { .quirks = QUIRK_HAS_PMU_CONFIG | QUIRK_HAS_RST_STAT, }; +static const struct s3c2410_wdt_variant drv_data_exynos7 = { + .disable_reg = EXYNOS5_WDT_DISABLE_REG_OFFSET, + .mask_reset_reg = EXYNOS5_WDT_MASK_RESET_REG_OFFSET, + .mask_bit = 0, + .rst_stat_reg = EXYNOS5_RST_STAT_REG_OFFSET, + .rst_stat_bit = 23, /* A57 WDTRESET */ + .quirks = QUIRK_HAS_PMU_CONFIG | QUIRK_HAS_RST_STAT, +}; + static const struct of_device_id s3c2410_wdt_match[] = { { .compatible = "samsung,s3c2410-wdt", .data = &drv_data_s3c2410 }, @@ -162,6 +171,8 @@ static const struct of_device_id s3c2410_wdt_match[] = { .data = &drv_data_exynos5250 }, { .compatible = "samsung,exynos5420-wdt", .data = &drv_data_exynos5420 }, + { .compatible = "samsung,exynos7-wdt", + .data = &drv_data_exynos7 }, {}, }; MODULE_DEVICE_TABLE(of, s3c2410_wdt_match); -- cgit v1.2.3 From 22b1c841e31510c3124c88a13b8a7ada14e2e2d1 Mon Sep 17 00:00:00 2001 From: Beniamino Galvani Date: Mon, 29 Sep 2014 00:39:47 +0200 Subject: watchdog: add driver for Ricoh RN5T618 watchdog This adds a driver for the watchdog timer available in Ricoh RN5T618 PMIC. The device supports a programmable expiration time of 1, 8, 32 or 128 seconds. Signed-off-by: Beniamino Galvani Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 11 +++ drivers/watchdog/Makefile | 1 + drivers/watchdog/rn5t618_wdt.c | 198 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 drivers/watchdog/rn5t618_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 17b39b8a2520..4edae08800ec 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -327,6 +327,17 @@ config ORION_WATCHDOG To compile this driver as a module, choose M here: the module will be called orion_wdt. +config RN5T618_WATCHDOG + tristate "Ricoh RN5T618 watchdog" + depends on MFD_RN5T618 + select WATCHDOG_CORE + help + If you say yes here you get support for watchdog on the Ricoh + RN5T618 PMIC. + + This driver can also be built as a module. If so, the module + will be called rn5t618_wdt. + config SUNXI_WATCHDOG tristate "Allwinner SoCs watchdog support" depends on ARCH_SUNXI diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index 44f5d6842fc6..f105999da2f3 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_IOP_WATCHDOG) += iop_wdt.o obj-$(CONFIG_DAVINCI_WATCHDOG) += davinci_wdt.o obj-$(CONFIG_ORION_WATCHDOG) += orion_wdt.o obj-$(CONFIG_SUNXI_WATCHDOG) += sunxi_wdt.o +obj-$(CONFIG_RN5T618_WATCHDOG) += rn5t618_wdt.o obj-$(CONFIG_COH901327_WATCHDOG) += coh901327_wdt.o obj-$(CONFIG_STMP3XXX_RTC_WATCHDOG) += stmp3xxx_rtc_wdt.o obj-$(CONFIG_NUC900_WATCHDOG) += nuc900_wdt.o diff --git a/drivers/watchdog/rn5t618_wdt.c b/drivers/watchdog/rn5t618_wdt.c new file mode 100644 index 000000000000..d1c12278cb6a --- /dev/null +++ b/drivers/watchdog/rn5t618_wdt.c @@ -0,0 +1,198 @@ +/* + * Watchdog driver for Ricoh RN5T618 PMIC + * + * Copyright (C) 2014 Beniamino Galvani + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include + +#define DRIVER_NAME "rn5t618-wdt" + +static bool nowayout = WATCHDOG_NOWAYOUT; +static unsigned int timeout; + +module_param(timeout, uint, 0); +MODULE_PARM_DESC(timeout, "Initial watchdog timeout in seconds"); + +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +struct rn5t618_wdt { + struct watchdog_device wdt_dev; + struct rn5t618 *rn5t618; +}; + +/* + * This array encodes the values of WDOGTIM field for the supported + * watchdog expiration times. If the watchdog is not accessed before + * the timer expiration, the PMU generates an interrupt and if the CPU + * doesn't clear it within one second the system is restarted. + */ +static const struct { + u8 reg_val; + unsigned int time; +} rn5t618_wdt_map[] = { + { 0, 1 }, + { 1, 8 }, + { 2, 32 }, + { 3, 128 }, +}; + +static int rn5t618_wdt_set_timeout(struct watchdog_device *wdt_dev, + unsigned int t) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + int ret, i; + + for (i = 0; i < ARRAY_SIZE(rn5t618_wdt_map); i++) { + if (rn5t618_wdt_map[i].time + 1 >= t) + break; + } + + if (i == ARRAY_SIZE(rn5t618_wdt_map)) + return -EINVAL; + + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGTIM_M, + rn5t618_wdt_map[i].reg_val); + if (!ret) + wdt_dev->timeout = rn5t618_wdt_map[i].time; + + return ret; +} + +static int rn5t618_wdt_start(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + int ret; + + ret = rn5t618_wdt_set_timeout(wdt_dev, wdt_dev->timeout); + if (ret) + return ret; + + /* enable repower-on */ + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_REPCNT, + RN5T618_REPCNT_REPWRON, + RN5T618_REPCNT_REPWRON); + if (ret) + return ret; + + /* enable watchdog */ + ret = regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGEN, + RN5T618_WATCHDOG_WDOGEN); + if (ret) + return ret; + + /* enable watchdog interrupt */ + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_PWRIREN, + RN5T618_PWRIRQ_IR_WDOG, + RN5T618_PWRIRQ_IR_WDOG); +} + +static int rn5t618_wdt_stop(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_WATCHDOG, + RN5T618_WATCHDOG_WDOGEN, 0); +} + +static int rn5t618_wdt_ping(struct watchdog_device *wdt_dev) +{ + struct rn5t618_wdt *wdt = watchdog_get_drvdata(wdt_dev); + unsigned int val; + int ret; + + /* The counter is restarted after a R/W access to watchdog register */ + ret = regmap_read(wdt->rn5t618->regmap, RN5T618_WATCHDOG, &val); + if (ret) + return ret; + + ret = regmap_write(wdt->rn5t618->regmap, RN5T618_WATCHDOG, val); + if (ret) + return ret; + + /* Clear pending watchdog interrupt */ + return regmap_update_bits(wdt->rn5t618->regmap, RN5T618_PWRIRQ, + RN5T618_PWRIRQ_IR_WDOG, 0); +} + +static struct watchdog_info rn5t618_wdt_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | + WDIOF_KEEPALIVEPING, + .identity = DRIVER_NAME, +}; + +static struct watchdog_ops rn5t618_wdt_ops = { + .owner = THIS_MODULE, + .start = rn5t618_wdt_start, + .stop = rn5t618_wdt_stop, + .ping = rn5t618_wdt_ping, + .set_timeout = rn5t618_wdt_set_timeout, +}; + +static int rn5t618_wdt_probe(struct platform_device *pdev) +{ + struct rn5t618 *rn5t618 = dev_get_drvdata(pdev->dev.parent); + struct rn5t618_wdt *wdt; + int min_timeout, max_timeout; + + wdt = devm_kzalloc(&pdev->dev, sizeof(struct rn5t618_wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + min_timeout = rn5t618_wdt_map[0].time; + max_timeout = rn5t618_wdt_map[ARRAY_SIZE(rn5t618_wdt_map) - 1].time; + + wdt->rn5t618 = rn5t618; + wdt->wdt_dev.info = &rn5t618_wdt_info; + wdt->wdt_dev.ops = &rn5t618_wdt_ops; + wdt->wdt_dev.min_timeout = min_timeout; + wdt->wdt_dev.max_timeout = max_timeout; + wdt->wdt_dev.timeout = max_timeout; + wdt->wdt_dev.parent = &pdev->dev; + + watchdog_set_drvdata(&wdt->wdt_dev, wdt); + watchdog_init_timeout(&wdt->wdt_dev, timeout, &pdev->dev); + watchdog_set_nowayout(&wdt->wdt_dev, nowayout); + + platform_set_drvdata(pdev, wdt); + + return watchdog_register_device(&wdt->wdt_dev); +} + +static int rn5t618_wdt_remove(struct platform_device *pdev) +{ + struct rn5t618_wdt *wdt = platform_get_drvdata(pdev); + + watchdog_unregister_device(&wdt->wdt_dev); + + return 0; +} + +static struct platform_driver rn5t618_wdt_driver = { + .probe = rn5t618_wdt_probe, + .remove = rn5t618_wdt_remove, + .driver = { + .name = DRIVER_NAME, + }, +}; + +module_platform_driver(rn5t618_wdt_driver); + +MODULE_AUTHOR("Beniamino Galvani "); +MODULE_DESCRIPTION("RN5T618 watchdog driver"); +MODULE_LICENSE("GPL v2"); -- cgit v1.2.3 From 5e9c16e3760893b3721f599f180795ca7160afef Mon Sep 17 00:00:00 2001 From: Krystian Garbaciak Date: Sun, 28 Sep 2014 19:05:45 +0200 Subject: watchdog: Add DA9063 PMIC watchdog driver. This driver supports the watchdog device inside the DA9063 PMIC. Signed-off-by: Krystian Garbaciak Signed-off-by: Philipp Zabel Signed-off-by: Markus Pargmann Acked-by: Steve Twiss Tested-by: Steve Twiss Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 9 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/da9063_wdt.c | 191 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+) create mode 100644 drivers/watchdog/da9063_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 4edae08800ec..a51ccf3461fb 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -87,6 +87,15 @@ config DA9055_WATCHDOG This driver can also be built as a module. If so, the module will be called da9055_wdt. +config DA9063_WATCHDOG + tristate "Dialog DA9063 Watchdog" + depends on MFD_DA9063 + select WATCHDOG_CORE + help + Support for the watchdog in the DA9063 PMIC. + + This driver can be built as a module. The module name is da9063_wdt. + config GPIO_WATCHDOG tristate "Watchdog device controlled through GPIO-line" depends on OF_GPIO diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index f105999da2f3..f8eaaf45e923 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -176,6 +176,7 @@ obj-$(CONFIG_XEN_WDT) += xen_wdt.o # Architecture Independent obj-$(CONFIG_DA9052_WATCHDOG) += da9052_wdt.o obj-$(CONFIG_DA9055_WATCHDOG) += da9055_wdt.o +obj-$(CONFIG_DA9063_WATCHDOG) += da9063_wdt.o obj-$(CONFIG_GPIO_WATCHDOG) += gpio_wdt.o obj-$(CONFIG_WM831X_WATCHDOG) += wm831x_wdt.o obj-$(CONFIG_WM8350_WATCHDOG) += wm8350_wdt.o diff --git a/drivers/watchdog/da9063_wdt.c b/drivers/watchdog/da9063_wdt.c new file mode 100644 index 000000000000..2cd6b2c2dd2a --- /dev/null +++ b/drivers/watchdog/da9063_wdt.c @@ -0,0 +1,191 @@ +/* + * Watchdog driver for DA9063 PMICs. + * + * Copyright(c) 2012 Dialog Semiconductor Ltd. + * + * Author: Mariusz Wojtasik + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Watchdog selector to timeout in seconds. + * 0: WDT disabled; + * others: timeout = 2048 ms * 2^(TWDSCALE-1). + */ +static const unsigned int wdt_timeout[] = { 0, 2, 4, 8, 16, 32, 65, 131 }; +#define DA9063_TWDSCALE_DISABLE 0 +#define DA9063_TWDSCALE_MIN 1 +#define DA9063_TWDSCALE_MAX (ARRAY_SIZE(wdt_timeout) - 1) +#define DA9063_WDT_MIN_TIMEOUT wdt_timeout[DA9063_TWDSCALE_MIN] +#define DA9063_WDT_MAX_TIMEOUT wdt_timeout[DA9063_TWDSCALE_MAX] +#define DA9063_WDG_TIMEOUT wdt_timeout[3] + +struct da9063_watchdog { + struct da9063 *da9063; + struct watchdog_device wdtdev; +}; + +static unsigned int da9063_wdt_timeout_to_sel(unsigned int secs) +{ + unsigned int i; + + for (i = DA9063_TWDSCALE_MIN; i <= DA9063_TWDSCALE_MAX; i++) { + if (wdt_timeout[i] >= secs) + return i; + } + + return DA9063_TWDSCALE_MAX; +} + +static int _da9063_wdt_set_timeout(struct da9063 *da9063, unsigned int regval) +{ + return regmap_update_bits(da9063->regmap, DA9063_REG_CONTROL_D, + DA9063_TWDSCALE_MASK, regval); +} + +static int da9063_wdt_start(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + unsigned int selector; + int ret; + + selector = da9063_wdt_timeout_to_sel(wdt->wdtdev.timeout); + ret = _da9063_wdt_set_timeout(wdt->da9063, selector); + if (ret) + dev_err(wdt->da9063->dev, "Watchdog failed to start (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_stop(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + int ret; + + ret = regmap_update_bits(wdt->da9063->regmap, DA9063_REG_CONTROL_D, + DA9063_TWDSCALE_MASK, DA9063_TWDSCALE_DISABLE); + if (ret) + dev_alert(wdt->da9063->dev, "Watchdog failed to stop (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_ping(struct watchdog_device *wdd) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + int ret; + + ret = regmap_write(wdt->da9063->regmap, DA9063_REG_CONTROL_F, + DA9063_WATCHDOG); + if (ret) + dev_alert(wdt->da9063->dev, "Failed to ping the watchdog (err = %d)\n", + ret); + + return ret; +} + +static int da9063_wdt_set_timeout(struct watchdog_device *wdd, + unsigned int timeout) +{ + struct da9063_watchdog *wdt = watchdog_get_drvdata(wdd); + unsigned int selector; + int ret; + + selector = da9063_wdt_timeout_to_sel(timeout); + ret = _da9063_wdt_set_timeout(wdt->da9063, selector); + if (ret) + dev_err(wdt->da9063->dev, "Failed to set watchdog timeout (err = %d)\n", + ret); + else + wdd->timeout = wdt_timeout[selector]; + + return ret; +} + +static const struct watchdog_info da9063_watchdog_info = { + .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING, + .identity = "DA9063 Watchdog", +}; + +static const struct watchdog_ops da9063_watchdog_ops = { + .owner = THIS_MODULE, + .start = da9063_wdt_start, + .stop = da9063_wdt_stop, + .ping = da9063_wdt_ping, + .set_timeout = da9063_wdt_set_timeout, +}; + +static int da9063_wdt_probe(struct platform_device *pdev) +{ + int ret; + struct da9063 *da9063; + struct da9063_watchdog *wdt; + + if (!pdev->dev.parent) + return -EINVAL; + + da9063 = dev_get_drvdata(pdev->dev.parent); + if (!da9063) + return -EINVAL; + + wdt = devm_kzalloc(&pdev->dev, sizeof(*wdt), GFP_KERNEL); + if (!wdt) + return -ENOMEM; + + wdt->da9063 = da9063; + + wdt->wdtdev.info = &da9063_watchdog_info; + wdt->wdtdev.ops = &da9063_watchdog_ops; + wdt->wdtdev.min_timeout = DA9063_WDT_MIN_TIMEOUT; + wdt->wdtdev.max_timeout = DA9063_WDT_MAX_TIMEOUT; + wdt->wdtdev.timeout = DA9063_WDG_TIMEOUT; + + wdt->wdtdev.status = WATCHDOG_NOWAYOUT_INIT_STATUS; + + watchdog_set_drvdata(&wdt->wdtdev, wdt); + dev_set_drvdata(&pdev->dev, wdt); + + ret = watchdog_register_device(&wdt->wdtdev); + + return ret; +} + +static int da9063_wdt_remove(struct platform_device *pdev) +{ + struct da9063_watchdog *wdt = dev_get_drvdata(&pdev->dev); + + watchdog_unregister_device(&wdt->wdtdev); + + return 0; +} + +static struct platform_driver da9063_wdt_driver = { + .probe = da9063_wdt_probe, + .remove = da9063_wdt_remove, + .driver = { + .name = DA9063_DRVNAME_WATCHDOG, + }, +}; +module_platform_driver(da9063_wdt_driver); + +MODULE_AUTHOR("Mariusz Wojtasik "); +MODULE_DESCRIPTION("Watchdog driver for Dialog DA9063"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:" DA9063_DRVNAME_WATCHDOG); -- cgit v1.2.3 From 3281b85c8697938e344d67144ca8ba520fa54d2b Mon Sep 17 00:00:00 2001 From: Janusz Uzycki Date: Mon, 22 Sep 2014 22:55:47 +0200 Subject: stmp3xxx_rtc_wdt: Add suspend/resume PM support There is no conflict with rtc/rtc-stmp3xxx.c parent because modified registers in PM functions of stmp3xxx_rtc_wdt are different. Signed-off-by: Janusz Uzycki Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/stmp3xxx_rtc_wdt.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/watchdog/stmp3xxx_rtc_wdt.c b/drivers/watchdog/stmp3xxx_rtc_wdt.c index 3804d5e9baea..a62b1b6decf4 100644 --- a/drivers/watchdog/stmp3xxx_rtc_wdt.c +++ b/drivers/watchdog/stmp3xxx_rtc_wdt.c @@ -94,9 +94,33 @@ static int stmp3xxx_wdt_remove(struct platform_device *pdev) return 0; } +static int __maybe_unused stmp3xxx_wdt_suspend(struct device *dev) +{ + struct watchdog_device *wdd = &stmp3xxx_wdd; + + if (watchdog_active(wdd)) + return wdt_stop(wdd); + + return 0; +} + +static int __maybe_unused stmp3xxx_wdt_resume(struct device *dev) +{ + struct watchdog_device *wdd = &stmp3xxx_wdd; + + if (watchdog_active(wdd)) + return wdt_start(wdd); + + return 0; +} + +static SIMPLE_DEV_PM_OPS(stmp3xxx_wdt_pm_ops, + stmp3xxx_wdt_suspend, stmp3xxx_wdt_resume); + static struct platform_driver stmp3xxx_wdt_driver = { .driver = { .name = "stmp3xxx_rtc_wdt", + .pm = &stmp3xxx_wdt_pm_ops, }, .probe = stmp3xxx_wdt_probe, .remove = stmp3xxx_wdt_remove, -- cgit v1.2.3 From 0c5691f00879cacf98a31b873c02d71c66d72855 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Sat, 20 Sep 2014 19:06:49 +0200 Subject: ARM: docs: add documentation binding for meson watchdog Signed-off-by: Carlo Caione Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- Documentation/devicetree/bindings/watchdog/meson6-wdt.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 Documentation/devicetree/bindings/watchdog/meson6-wdt.txt diff --git a/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt new file mode 100644 index 000000000000..9200fc2d508c --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/meson6-wdt.txt @@ -0,0 +1,13 @@ +Meson SoCs Watchdog timer + +Required properties: + +- compatible : should be "amlogic,meson6-wdt" +- reg : Specifies base physical address and size of the registers. + +Example: + +wdt: watchdog@c1109900 { + compatible = "amlogic,meson6-wdt"; + reg = <0xc1109900 0x8>; +}; -- cgit v1.2.3 From 22e1b8f60f913cf71e688af9b64317b515303f4c Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Sat, 20 Sep 2014 19:06:50 +0200 Subject: ARM: meson: add watchdog driver This patch adds the watchdog driver for the Amlogic Meson SoCs used also to reboot the device. Signed-off-by: Carlo Caione Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 10 ++ drivers/watchdog/Makefile | 1 + drivers/watchdog/meson_wdt.c | 236 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 247 insertions(+) create mode 100644 drivers/watchdog/meson_wdt.c diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index a51ccf3461fb..9ff8588c0df6 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -494,6 +494,16 @@ config QCOM_WDT To compile this driver as a module, choose M here: the module will be called qcom_wdt. +config MESON_WATCHDOG + tristate "Amlogic Meson SoCs watchdog support" + depends on ARCH_MESON + select WATCHDOG_CORE + help + Say Y here to include support for the watchdog timer + in Amlogic Meson SoCs. + To compile this driver as a module, choose M here: the + module will be called meson_wdt. + # AVR32 Architecture config AT32AP700X_WDT diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index f8eaaf45e923..c569ec8f8a76 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -62,6 +62,7 @@ obj-$(CONFIG_SIRFSOC_WATCHDOG) += sirfsoc_wdt.o obj-$(CONFIG_QCOM_WDT) += qcom-wdt.o obj-$(CONFIG_BCM_KONA_WDT) += bcm_kona_wdt.o obj-$(CONFIG_TEGRA_WATCHDOG) += tegra_wdt.o +obj-$(CONFIG_MESON_WATCHDOG) += meson_wdt.o # AVR32 Architecture obj-$(CONFIG_AT32AP700X_WDT) += at32ap700x_wdt.o diff --git a/drivers/watchdog/meson_wdt.c b/drivers/watchdog/meson_wdt.c new file mode 100644 index 000000000000..37f9f5ec6cb0 --- /dev/null +++ b/drivers/watchdog/meson_wdt.c @@ -0,0 +1,236 @@ +/* + * Meson Watchdog Driver + * + * Copyright (c) 2014 Carlo Caione + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "meson_wdt" + +#define MESON_WDT_TC 0x00 +#define MESON_WDT_TC_EN BIT(22) +#define MESON_WDT_TC_TM_MASK 0x3fffff +#define MESON_WDT_DC_RESET (3 << 24) + +#define MESON_WDT_RESET 0x04 + +#define MESON_WDT_TIMEOUT 30 +#define MESON_WDT_MIN_TIMEOUT 1 +#define MESON_WDT_MAX_TIMEOUT (MESON_WDT_TC_TM_MASK / 100000) + +#define MESON_SEC_TO_TC(s) ((s) * 100000) + +static bool nowayout = WATCHDOG_NOWAYOUT; +static unsigned int timeout = MESON_WDT_TIMEOUT; + +struct meson_wdt_dev { + struct watchdog_device wdt_dev; + void __iomem *wdt_base; + struct notifier_block restart_handler; +}; + +static int meson_restart_handle(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + u32 tc_reboot = MESON_WDT_DC_RESET | MESON_WDT_TC_EN | 100; + struct meson_wdt_dev *meson_wdt = container_of(this, + struct meson_wdt_dev, + restart_handler); + + while (1) { + writel(tc_reboot, meson_wdt->wdt_base + MESON_WDT_TC); + mdelay(5); + } + + return NOTIFY_DONE; +} + +static int meson_wdt_ping(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + + writel(0, meson_wdt->wdt_base + MESON_WDT_RESET); + + return 0; +} + +static void meson_wdt_change_timeout(struct watchdog_device *wdt_dev, + unsigned int timeout) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg &= ~MESON_WDT_TC_TM_MASK; + reg |= MESON_SEC_TO_TC(timeout); + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); +} + +static int meson_wdt_set_timeout(struct watchdog_device *wdt_dev, + unsigned int timeout) +{ + wdt_dev->timeout = timeout; + + meson_wdt_change_timeout(wdt_dev, timeout); + meson_wdt_ping(wdt_dev); + + return 0; +} + +static int meson_wdt_stop(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg &= ~MESON_WDT_TC_EN; + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); + + return 0; +} + +static int meson_wdt_start(struct watchdog_device *wdt_dev) +{ + struct meson_wdt_dev *meson_wdt = watchdog_get_drvdata(wdt_dev); + u32 reg; + + meson_wdt_change_timeout(wdt_dev, meson_wdt->wdt_dev.timeout); + meson_wdt_ping(wdt_dev); + + reg = readl(meson_wdt->wdt_base + MESON_WDT_TC); + reg |= MESON_WDT_TC_EN; + writel(reg, meson_wdt->wdt_base + MESON_WDT_TC); + + return 0; +} + +static const struct watchdog_info meson_wdt_info = { + .identity = DRV_NAME, + .options = WDIOF_SETTIMEOUT | + WDIOF_KEEPALIVEPING | + WDIOF_MAGICCLOSE, +}; + +static const struct watchdog_ops meson_wdt_ops = { + .owner = THIS_MODULE, + .start = meson_wdt_start, + .stop = meson_wdt_stop, + .ping = meson_wdt_ping, + .set_timeout = meson_wdt_set_timeout, +}; + +static int meson_wdt_probe(struct platform_device *pdev) +{ + struct resource *res; + struct meson_wdt_dev *meson_wdt; + int err; + + meson_wdt = devm_kzalloc(&pdev->dev, sizeof(*meson_wdt), GFP_KERNEL); + if (!meson_wdt) + return -ENOMEM; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + meson_wdt->wdt_base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(meson_wdt->wdt_base)) + return PTR_ERR(meson_wdt->wdt_base); + + meson_wdt->wdt_dev.parent = &pdev->dev; + meson_wdt->wdt_dev.info = &meson_wdt_info; + meson_wdt->wdt_dev.ops = &meson_wdt_ops; + meson_wdt->wdt_dev.timeout = MESON_WDT_TIMEOUT; + meson_wdt->wdt_dev.max_timeout = MESON_WDT_MAX_TIMEOUT; + meson_wdt->wdt_dev.min_timeout = MESON_WDT_MIN_TIMEOUT; + + watchdog_set_drvdata(&meson_wdt->wdt_dev, meson_wdt); + + watchdog_init_timeout(&meson_wdt->wdt_dev, timeout, &pdev->dev); + watchdog_set_nowayout(&meson_wdt->wdt_dev, nowayout); + + meson_wdt_stop(&meson_wdt->wdt_dev); + + err = watchdog_register_device(&meson_wdt->wdt_dev); + if (err) + return err; + + platform_set_drvdata(pdev, meson_wdt); + + meson_wdt->restart_handler.notifier_call = meson_restart_handle; + meson_wdt->restart_handler.priority = 128; + err = register_restart_handler(&meson_wdt->restart_handler); + if (err) + dev_err(&pdev->dev, + "cannot register restart handler (err=%d)\n", err); + + dev_info(&pdev->dev, "Watchdog enabled (timeout=%d sec, nowayout=%d)", + meson_wdt->wdt_dev.timeout, nowayout); + + return 0; +} + +static int meson_wdt_remove(struct platform_device *pdev) +{ + struct meson_wdt_dev *meson_wdt = platform_get_drvdata(pdev); + + unregister_restart_handler(&meson_wdt->restart_handler); + + watchdog_unregister_device(&meson_wdt->wdt_dev); + + return 0; +} + +static void meson_wdt_shutdown(struct platform_device *pdev) +{ + struct meson_wdt_dev *meson_wdt = platform_get_drvdata(pdev); + + meson_wdt_stop(&meson_wdt->wdt_dev); +} + +static const struct of_device_id meson_wdt_dt_ids[] = { + { .compatible = "amlogic,meson6-wdt" }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, meson_wdt_dt_ids); + +static struct platform_driver meson_wdt_driver = { + .probe = meson_wdt_probe, + .remove = meson_wdt_remove, + .shutdown = meson_wdt_shutdown, + .driver = { + .owner = THIS_MODULE, + .name = DRV_NAME, + .of_match_table = meson_wdt_dt_ids, + }, +}; + +module_platform_driver(meson_wdt_driver); + +module_param(timeout, uint, 0); +MODULE_PARM_DESC(timeout, "Watchdog heartbeat in seconds"); + +module_param(nowayout, bool, 0); +MODULE_PARM_DESC(nowayout, + "Watchdog cannot be stopped once started (default=" + __MODULE_STRING(WATCHDOG_NOWAYOUT) ")"); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Carlo Caione "); +MODULE_DESCRIPTION("Meson Watchdog Timer Driver"); -- cgit v1.2.3 From 69a160a0543fd569661048a8692c10afcdb1914b Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Sat, 20 Sep 2014 19:06:52 +0200 Subject: ARM: defconfig: update multi_v7_defconfig Update the multi_v7_defconfig enabling the watchdog driver for Meson SoCs. Signed-off-by: Carlo Caione Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- arch/arm/configs/multi_v7_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 491b7d5523bf..9702b140ae04 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -261,6 +261,7 @@ CONFIG_WATCHDOG=y CONFIG_XILINX_WATCHDOG=y CONFIG_ORION_WATCHDOG=y CONFIG_SUNXI_WATCHDOG=y +CONFIG_MESON_WATCHDOG=y CONFIG_MFD_AS3722=y CONFIG_MFD_BCM590XX=y CONFIG_MFD_CROS_EC=y -- cgit v1.2.3 From 31228f43ab528628c9b5f1351604361aa1d78533 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 23 Sep 2014 15:42:12 +0800 Subject: watchdog: dw_wdt: add restart handler support The kernel core now provides an API to trigger a system restart. Register with it to support restarting the system via. watchdog. Signed-off-by: Jisheng Zhang Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/dw_wdt.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/watchdog/dw_wdt.c b/drivers/watchdog/dw_wdt.c index 449c88523364..9e577a64ec9e 100644 --- a/drivers/watchdog/dw_wdt.c +++ b/drivers/watchdog/dw_wdt.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -29,9 +30,11 @@ #include #include #include +#include #include #include #include +#include #include #include #include @@ -63,6 +66,7 @@ static struct { unsigned long next_heartbeat; struct timer_list timer; int expect_close; + struct notifier_block restart_handler; } dw_wdt; static inline int dw_wdt_is_enabled(void) @@ -121,6 +125,26 @@ static void dw_wdt_keepalive(void) WDOG_COUNTER_RESTART_REG_OFFSET); } +static int dw_wdt_restart_handle(struct notifier_block *this, + unsigned long mode, void *cmd) +{ + u32 val; + + writel(0, dw_wdt.regs + WDOG_TIMEOUT_RANGE_REG_OFFSET); + val = readl(dw_wdt.regs + WDOG_CONTROL_REG_OFFSET); + if (val & WDOG_CONTROL_REG_WDT_EN_MASK) + writel(WDOG_COUNTER_RESTART_KICK_VALUE, dw_wdt.regs + + WDOG_COUNTER_RESTART_REG_OFFSET); + else + writel(WDOG_CONTROL_REG_WDT_EN_MASK, + dw_wdt.regs + WDOG_CONTROL_REG_OFFSET); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static void dw_wdt_ping(unsigned long data) { if (time_before(jiffies, dw_wdt.next_heartbeat) || @@ -316,6 +340,12 @@ static int dw_wdt_drv_probe(struct platform_device *pdev) if (ret) goto out_disable_clk; + dw_wdt.restart_handler.notifier_call = dw_wdt_restart_handle; + dw_wdt.restart_handler.priority = 128; + ret = register_restart_handler(&dw_wdt.restart_handler); + if (ret) + pr_warn("cannot register restart handler\n"); + dw_wdt_set_next_heartbeat(); setup_timer(&dw_wdt.timer, dw_wdt_ping, 0); mod_timer(&dw_wdt.timer, jiffies + WDT_TIMEOUT); @@ -330,6 +360,8 @@ out_disable_clk: static int dw_wdt_drv_remove(struct platform_device *pdev) { + unregister_restart_handler(&dw_wdt.restart_handler); + misc_deregister(&dw_wdt_miscdev); clk_disable_unprepare(dw_wdt.clk); -- cgit v1.2.3 From f286e1335f579dfd970c7fc3f62b248773a47a5c Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Tue, 19 Aug 2014 17:45:36 -0700 Subject: watchdog: s3c2410: add restart handler On a lot of Samsung systems the watchdog is responsible for restarting the system and until now this code was contained in plat-samsung/watchdog-reset.c. With the introduction of the restart handlers, this code can now move into driver itself, removing the need for arch-specific code. Tested on a S3C2442 based GTA02 Signed-off-by: Heiko Stuebner Signed-off-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck Signed-off-by: Andrew Morton --- drivers/watchdog/s3c2410_wdt.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/watchdog/s3c2410_wdt.c b/drivers/watchdog/s3c2410_wdt.c index 015256e496ae..8532c3e2aea7 100644 --- a/drivers/watchdog/s3c2410_wdt.c +++ b/drivers/watchdog/s3c2410_wdt.c @@ -41,6 +41,8 @@ #include #include #include +#include +#include #define S3C2410_WTCON 0x00 #define S3C2410_WTDAT 0x04 @@ -128,6 +130,7 @@ struct s3c2410_wdt { unsigned long wtdat_save; struct watchdog_device wdt_device; struct notifier_block freq_transition; + struct notifier_block restart_handler; struct s3c2410_wdt_variant *drv_data; struct regmap *pmureg; }; @@ -449,6 +452,31 @@ static inline void s3c2410wdt_cpufreq_deregister(struct s3c2410_wdt *wdt) } #endif +static int s3c2410wdt_restart(struct notifier_block *this, + unsigned long mode, void *cmd) +{ + struct s3c2410_wdt *wdt = container_of(this, struct s3c2410_wdt, + restart_handler); + void __iomem *wdt_base = wdt->reg_base; + + /* disable watchdog, to be safe */ + writel(0, wdt_base + S3C2410_WTCON); + + /* put initial values into count and data */ + writel(0x80, wdt_base + S3C2410_WTCNT); + writel(0x80, wdt_base + S3C2410_WTDAT); + + /* set the watchdog to go and reset... */ + writel(S3C2410_WTCON_ENABLE | S3C2410_WTCON_DIV16 | + S3C2410_WTCON_RSTEN | S3C2410_WTCON_PRESCALE(0x20), + wdt_base + S3C2410_WTCON); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static inline unsigned int s3c2410wdt_get_bootstatus(struct s3c2410_wdt *wdt) { unsigned int rst_stat; @@ -603,6 +631,12 @@ static int s3c2410wdt_probe(struct platform_device *pdev) platform_set_drvdata(pdev, wdt); + wdt->restart_handler.notifier_call = s3c2410wdt_restart; + wdt->restart_handler.priority = 128; + ret = register_restart_handler(&wdt->restart_handler); + if (ret) + pr_err("cannot register restart handler, %d\n", ret); + /* print out a statement of readiness */ wtcon = readl(wdt->reg_base + S3C2410_WTCON); @@ -632,6 +666,8 @@ static int s3c2410wdt_remove(struct platform_device *dev) int ret; struct s3c2410_wdt *wdt = platform_get_drvdata(dev); + unregister_restart_handler(&wdt->restart_handler); + ret = s3c2410wdt_mask_and_disable_reset(wdt, true); if (ret < 0) return ret; -- cgit v1.2.3 From 05e487d905ab29b5756d6d1e47e27eefa6693fb3 Mon Sep 17 00:00:00 2001 From: Josh Cartwright Date: Thu, 25 Sep 2014 17:51:04 -0500 Subject: watchdog: qcom: register a restart notifier The WDT's BITE_TIME warm-reset behavior can be leveraged as a last resort mechanism for triggering chip reset. Usually, other restart methods (such as PS_HOLD) are preferrable for issuing a more complete reset of the chip. As such, keep the priority of the watchdog notifier low. Signed-off-by: Josh Cartwright Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/qcom-wdt.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/drivers/watchdog/qcom-wdt.c b/drivers/watchdog/qcom-wdt.c index 68db322341bc..aa85618c4d03 100644 --- a/drivers/watchdog/qcom-wdt.c +++ b/drivers/watchdog/qcom-wdt.c @@ -11,11 +11,13 @@ * */ #include +#include #include #include #include #include #include +#include #include #define WDT_RST 0x0 @@ -26,6 +28,7 @@ struct qcom_wdt { struct watchdog_device wdd; struct clk *clk; unsigned long rate; + struct notifier_block restart_nb; void __iomem *base; }; @@ -84,6 +87,32 @@ static const struct watchdog_info qcom_wdt_info = { .identity = KBUILD_MODNAME, }; +static int qcom_wdt_restart(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct qcom_wdt *wdt = container_of(nb, struct qcom_wdt, restart_nb); + u32 timeout; + + /* + * Trigger watchdog bite: + * Setup BITE_TIME to be 128ms, and enable WDT. + */ + timeout = 128 * wdt->rate / 1000; + + writel(0, wdt->base + WDT_EN); + writel(1, wdt->base + WDT_RST); + writel(timeout, wdt->base + WDT_BITE_TIME); + writel(1, wdt->base + WDT_EN); + + /* + * Actually make sure the above sequence hits hardware before sleeping. + */ + wmb(); + + msleep(150); + return NOTIFY_DONE; +} + static int qcom_wdt_probe(struct platform_device *pdev) { struct qcom_wdt *wdt; @@ -147,6 +176,14 @@ static int qcom_wdt_probe(struct platform_device *pdev) goto err_clk_unprepare; } + /* + * WDT restart notifier has priority 0 (use as a last resort) + */ + wdt->restart_nb.notifier_call = qcom_wdt_restart; + ret = register_restart_handler(&wdt->restart_nb); + if (ret) + dev_err(&pdev->dev, "failed to setup restart handler\n"); + platform_set_drvdata(pdev, wdt); return 0; @@ -159,6 +196,7 @@ static int qcom_wdt_remove(struct platform_device *pdev) { struct qcom_wdt *wdt = platform_get_drvdata(pdev); + unregister_restart_handler(&wdt->restart_nb); watchdog_unregister_device(&wdt->wdd); clk_disable_unprepare(wdt->clk); return 0; -- cgit v1.2.3 From 334a9d8131254e06685b2af0c0f3cc7b3ec5bd04 Mon Sep 17 00:00:00 2001 From: Jingchang Lu Date: Fri, 12 Sep 2014 15:24:36 +0800 Subject: watchdog: imx2_wdt: add restart handler support Register the watchdog as the system restart function to the new introducing kernel restart call chain in the driver instead of providing the restart in machine desc. This restart handler function is from the mxc_restart() in arch/arm/mach-imx/system.c Signed-off-by: Jingchang Lu Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/imx2_wdt.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index f37bb05e7ec0..7e12f88bb4a6 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -22,14 +22,17 @@ */ #include +#include #include #include #include #include #include #include +#include #include #include +#include #include #include #include @@ -59,6 +62,7 @@ struct imx2_wdt_device { struct regmap *regmap; struct timer_list timer; /* Pings the watchdog when closed */ struct watchdog_device wdog; + struct notifier_block restart_handler; }; static bool nowayout = WATCHDOG_NOWAYOUT; @@ -77,6 +81,31 @@ static const struct watchdog_info imx2_wdt_info = { .options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE, }; +static int imx2_restart_handler(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + unsigned int wcr_enable = IMX2_WDT_WCR_WDE; + struct imx2_wdt_device *wdev = container_of(this, + struct imx2_wdt_device, + restart_handler); + /* Assert SRS signal */ + regmap_write(wdev->regmap, 0, wcr_enable); + /* + * Due to imx6q errata ERR004346 (WDOG: WDOG SRS bit requires to be + * written twice), we add another two writes to ensure there must be at + * least two writes happen in the same one 32kHz clock period. We save + * the target check here, since the writes shouldn't be a huge burden + * for other platforms. + */ + regmap_write(wdev->regmap, 0, wcr_enable); + regmap_write(wdev->regmap, 0, wcr_enable); + + /* wait for reset to assert... */ + mdelay(500); + + return NOTIFY_DONE; +} + static inline void imx2_wdt_setup(struct watchdog_device *wdog) { struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog); @@ -251,6 +280,12 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) return ret; } + wdev->restart_handler.notifier_call = imx2_restart_handler; + wdev->restart_handler.priority = 128; + ret = register_restart_handler(&wdev->restart_handler); + if (ret) + dev_err(&pdev->dev, "cannot register restart handler\n"); + dev_info(&pdev->dev, "timeout %d sec (nowayout=%d)\n", wdog->timeout, nowayout); @@ -262,6 +297,8 @@ static int __exit imx2_wdt_remove(struct platform_device *pdev) struct watchdog_device *wdog = platform_get_drvdata(pdev); struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog); + unregister_restart_handler(&wdev->restart_handler); + watchdog_unregister_device(wdog); if (imx2_wdt_is_running(wdev)) { -- cgit v1.2.3 From f2147de334703c7c44372f013d7d466d756e6943 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Sep 2014 00:05:18 +0800 Subject: watchdog: sunxi: support parameterized compatible strings This patch adds support for hardware parameters tied to compatible strings, so similar hardware can reuse the driver. This will be used to support the newer watchdog found in A31 and later SoCs. Differences in the new hardware include separate interrupt lines for each watchdog, and corresponding interrupt control/status registers. Watchdog control registers were also slightly rearranged. Also replace ioread32()/iowrite32() with readl()/writel() in various places changed. Signed-off-by: Chen-Yu Tsai Signed-off-by: Guenter Roeck Acked-by: Heiko Stuebner Signed-off-by: Wim Van Sebroeck Signed-off-by: Andrew Morton --- drivers/watchdog/sunxi_wdt.c | 101 ++++++++++++++++++++++++++++++++----------- 1 file changed, 76 insertions(+), 25 deletions(-) diff --git a/drivers/watchdog/sunxi_wdt.c b/drivers/watchdog/sunxi_wdt.c index 480bb557f353..a1f7113fc1d1 100644 --- a/drivers/watchdog/sunxi_wdt.c +++ b/drivers/watchdog/sunxi_wdt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -30,15 +31,11 @@ #define WDT_MAX_TIMEOUT 16 #define WDT_MIN_TIMEOUT 1 -#define WDT_MODE_TIMEOUT(n) ((n) << 3) -#define WDT_TIMEOUT_MASK WDT_MODE_TIMEOUT(0x0F) +#define WDT_TIMEOUT_MASK 0x0F -#define WDT_CTRL 0x00 #define WDT_CTRL_RELOAD ((1 << 0) | (0x0a57 << 1)) -#define WDT_MODE 0x04 #define WDT_MODE_EN (1 << 0) -#define WDT_MODE_RST_EN (1 << 1) #define DRV_NAME "sunxi-wdt" #define DRV_VERSION "1.0" @@ -46,15 +43,29 @@ static bool nowayout = WATCHDOG_NOWAYOUT; static unsigned int timeout = WDT_MAX_TIMEOUT; +/* + * This structure stores the register offsets for different variants + * of Allwinner's watchdog hardware. + */ +struct sunxi_wdt_reg { + u8 wdt_ctrl; + u8 wdt_cfg; + u8 wdt_mode; + u8 wdt_timeout_shift; + u8 wdt_reset_mask; + u8 wdt_reset_val; +}; + struct sunxi_wdt_dev { struct watchdog_device wdt_dev; void __iomem *wdt_base; + const struct sunxi_wdt_reg *wdt_regs; struct notifier_block restart_handler; }; /* * wdt_timeout_map maps the watchdog timer interval value in seconds to - * the value of the register WDT_MODE bit 3:6 + * the value of the register WDT_MODE at bits .wdt_timeout_shift ~ +3 * * [timeout seconds] = register value * @@ -82,19 +93,32 @@ static int sunxi_restart_handle(struct notifier_block *this, unsigned long mode, struct sunxi_wdt_dev, restart_handler); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; + u32 val; + + /* Set system reset function */ + val = readl(wdt_base + regs->wdt_cfg); + val &= ~(regs->wdt_reset_mask); + val |= regs->wdt_reset_val; + writel(val, wdt_base + regs->wdt_cfg); - /* Enable timer and set reset bit in the watchdog */ - writel(WDT_MODE_EN | WDT_MODE_RST_EN, wdt_base + WDT_MODE); + /* Set lowest timeout and enable watchdog */ + val = readl(wdt_base + regs->wdt_mode); + val &= ~(WDT_TIMEOUT_MASK << regs->wdt_timeout_shift); + val |= WDT_MODE_EN; + writel(val, wdt_base + regs->wdt_mode); /* * Restart the watchdog. The default (and lowest) interval * value for the watchdog is 0.5s. */ - writel(WDT_CTRL_RELOAD, wdt_base + WDT_CTRL); + writel(WDT_CTRL_RELOAD, wdt_base + regs->wdt_ctrl); while (1) { mdelay(5); - writel(WDT_MODE_EN | WDT_MODE_RST_EN, wdt_base + WDT_MODE); + val = readl(wdt_base + regs->wdt_mode); + val |= WDT_MODE_EN; + writel(val, wdt_base + regs->wdt_mode); } return NOTIFY_DONE; } @@ -103,8 +127,9 @@ static int sunxi_wdt_ping(struct watchdog_device *wdt_dev) { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; - iowrite32(WDT_CTRL_RELOAD, wdt_base + WDT_CTRL); + writel(WDT_CTRL_RELOAD, wdt_base + regs->wdt_ctrl); return 0; } @@ -114,6 +139,7 @@ static int sunxi_wdt_set_timeout(struct watchdog_device *wdt_dev, { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; u32 reg; if (wdt_timeout_map[timeout] == 0) @@ -121,10 +147,10 @@ static int sunxi_wdt_set_timeout(struct watchdog_device *wdt_dev, sunxi_wdt->wdt_dev.timeout = timeout; - reg = ioread32(wdt_base + WDT_MODE); - reg &= ~WDT_TIMEOUT_MASK; - reg |= WDT_MODE_TIMEOUT(wdt_timeout_map[timeout]); - iowrite32(reg, wdt_base + WDT_MODE); + reg = readl(wdt_base + regs->wdt_mode); + reg &= ~(WDT_TIMEOUT_MASK << regs->wdt_timeout_shift); + reg |= wdt_timeout_map[timeout] << regs->wdt_timeout_shift; + writel(reg, wdt_base + regs->wdt_mode); sunxi_wdt_ping(wdt_dev); @@ -135,8 +161,9 @@ static int sunxi_wdt_stop(struct watchdog_device *wdt_dev) { struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; - iowrite32(0, wdt_base + WDT_MODE); + writel(0, wdt_base + regs->wdt_mode); return 0; } @@ -146,6 +173,7 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev) u32 reg; struct sunxi_wdt_dev *sunxi_wdt = watchdog_get_drvdata(wdt_dev); void __iomem *wdt_base = sunxi_wdt->wdt_base; + const struct sunxi_wdt_reg *regs = sunxi_wdt->wdt_regs; int ret; ret = sunxi_wdt_set_timeout(&sunxi_wdt->wdt_dev, @@ -153,9 +181,16 @@ static int sunxi_wdt_start(struct watchdog_device *wdt_dev) if (ret < 0) return ret; - reg = ioread32(wdt_base + WDT_MODE); - reg |= (WDT_MODE_RST_EN | WDT_MODE_EN); - iowrite32(reg, wdt_base + WDT_MODE); + /* Set system reset function */ + reg = readl(wdt_base + regs->wdt_cfg); + reg &= ~(regs->wdt_reset_mask); + reg |= ~(regs->wdt_reset_val); + writel(reg, wdt_base + regs->wdt_cfg); + + /* Enable watchdog */ + reg = readl(wdt_base + regs->wdt_mode); + reg |= WDT_MODE_EN; + writel(reg, wdt_base + regs->wdt_mode); return 0; } @@ -175,9 +210,25 @@ static const struct watchdog_ops sunxi_wdt_ops = { .set_timeout = sunxi_wdt_set_timeout, }; +static const struct sunxi_wdt_reg sun4i_wdt_reg = { + .wdt_ctrl = 0x00, + .wdt_cfg = 0x04, + .wdt_mode = 0x04, + .wdt_timeout_shift = 3, + .wdt_reset_mask = 0x02, + .wdt_reset_val = 0x02, +}; + +static const struct of_device_id sunxi_wdt_dt_ids[] = { + { .compatible = "allwinner,sun4i-a10-wdt", .data = &sun4i_wdt_reg }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, sunxi_wdt_dt_ids); + static int sunxi_wdt_probe(struct platform_device *pdev) { struct sunxi_wdt_dev *sunxi_wdt; + const struct of_device_id *device; struct resource *res; int err; @@ -187,6 +238,12 @@ static int sunxi_wdt_probe(struct platform_device *pdev) platform_set_drvdata(pdev, sunxi_wdt); + device = of_match_device(sunxi_wdt_dt_ids, &pdev->dev); + if (!device) + return -ENODEV; + + sunxi_wdt->wdt_regs = device->data; + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); sunxi_wdt->wdt_base = devm_ioremap_resource(&pdev->dev, res); if (IS_ERR(sunxi_wdt->wdt_base)) @@ -242,12 +299,6 @@ static void sunxi_wdt_shutdown(struct platform_device *pdev) sunxi_wdt_stop(&sunxi_wdt->wdt_dev); } -static const struct of_device_id sunxi_wdt_dt_ids[] = { - { .compatible = "allwinner,sun4i-a10-wdt" }, - { /* sentinel */ } -}; -MODULE_DEVICE_TABLE(of, sunxi_wdt_dt_ids); - static struct platform_driver sunxi_wdt_driver = { .probe = sunxi_wdt_probe, .remove = sunxi_wdt_remove, -- cgit v1.2.3 From c5ec618fbf83045b9d51679d809ddd45f990fe0a Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 22 Sep 2014 00:05:19 +0800 Subject: watchdog: sunxi: Add A31 watchdog support This patch adds support for the watchdog hardware found in A31 and newer SoCs. This new hardware has registers at different offsets, and the system reset control has been split out of the "mode" register into a new "configuration" register. Differences not supported by this driver include separate interrupt lines for each watchdog, instead of sharing an interrupt line and registers with the timer block. Signed-off-by: Chen-Yu Tsai Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/sunxi_wdt.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/watchdog/sunxi_wdt.c b/drivers/watchdog/sunxi_wdt.c index a1f7113fc1d1..b62301e74e5f 100644 --- a/drivers/watchdog/sunxi_wdt.c +++ b/drivers/watchdog/sunxi_wdt.c @@ -219,8 +219,18 @@ static const struct sunxi_wdt_reg sun4i_wdt_reg = { .wdt_reset_val = 0x02, }; +static const struct sunxi_wdt_reg sun6i_wdt_reg = { + .wdt_ctrl = 0x10, + .wdt_cfg = 0x14, + .wdt_mode = 0x18, + .wdt_timeout_shift = 4, + .wdt_reset_mask = 0x03, + .wdt_reset_val = 0x01, +}; + static const struct of_device_id sunxi_wdt_dt_ids[] = { { .compatible = "allwinner,sun4i-a10-wdt", .data = &sun4i_wdt_reg }, + { .compatible = "allwinner,sun6i-a31-wdt", .data = &sun6i_wdt_reg }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, sunxi_wdt_dt_ids); -- cgit v1.2.3 From 71fd380a6b87f384002feceda39fd670ede7ea5f Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Sun, 5 Oct 2014 09:28:33 +0800 Subject: watchdog: Let XILINX_WATCHDOG and TEGRA_WATCHDOG depend on HAS_IOMEM They need HAS_IOMEM, so let them depend on it, the related error (with allmodconfig under um): MODPOST 1205 modules ERROR: "devm_ioremap_resource" [drivers/watchdog/tegra_wdt.ko] undefined! ERROR: "devm_ioremap_resource" [drivers/watchdog/of_xilinx_wdt.ko] undefined! Signed-off-by: Chen Gang Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 9ff8588c0df6..d0107d424ee4 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -132,6 +132,7 @@ config WM8350_WATCHDOG config XILINX_WATCHDOG tristate "Xilinx Watchdog timer" + depends on HAS_IOMEM select WATCHDOG_CORE help Watchdog driver for the xps_timebase_wdt ip core. @@ -472,7 +473,7 @@ config SIRFSOC_WATCHDOG config TEGRA_WATCHDOG tristate "Tegra watchdog" - depends on ARCH_TEGRA || COMPILE_TEST + depends on (ARCH_TEGRA || COMPILE_TEST) && HAS_IOMEM select WATCHDOG_CORE help Say Y here to include support for the watchdog timer -- cgit v1.2.3 From 06980b24cf9bfcc753a07ee362976169bb869869 Mon Sep 17 00:00:00 2001 From: Carlo Caione Date: Thu, 9 Oct 2014 21:59:16 +0200 Subject: watchdog: meson: remove magic value for reboot This patch removes the magic value used for rebooting the board. This value is useless and leads to a static checker warning as reported by Dan Carpenter. Signed-off-by: Carlo Caione Reviewed-by: Guenter Roeck Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/meson_wdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/watchdog/meson_wdt.c b/drivers/watchdog/meson_wdt.c index 37f9f5ec6cb0..ef6a298e8c45 100644 --- a/drivers/watchdog/meson_wdt.c +++ b/drivers/watchdog/meson_wdt.c @@ -51,7 +51,7 @@ struct meson_wdt_dev { static int meson_restart_handle(struct notifier_block *this, unsigned long mode, void *cmd) { - u32 tc_reboot = MESON_WDT_DC_RESET | MESON_WDT_TC_EN | 100; + u32 tc_reboot = MESON_WDT_DC_RESET | MESON_WDT_TC_EN; struct meson_wdt_dev *meson_wdt = container_of(this, struct meson_wdt_dev, restart_handler); -- cgit v1.2.3 From 5f8b35b6330db14d15fb385cc7b2ccca53dc323e Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:39:24 +0800 Subject: ACPICA: Add string for _DDN method name. The _DDN method will be used internally. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/acnames.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/acpi/acnames.h b/include/acpi/acnames.h index f97804bdf1ff..7461327e14e4 100644 --- a/include/acpi/acnames.h +++ b/include/acpi/acnames.h @@ -52,6 +52,7 @@ #define METHOD_NAME__CBA "_CBA" #define METHOD_NAME__CID "_CID" #define METHOD_NAME__CRS "_CRS" +#define METHOD_NAME__DDN "_DDN" #define METHOD_NAME__HID "_HID" #define METHOD_NAME__INI "_INI" #define METHOD_NAME__PLD "_PLD" -- cgit v1.2.3 From f1b697525d5428856eaba2be2ee6dc1cf3efbbbe Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:32 +0800 Subject: ACPICA: acpidump: Add ACPI 1.0 RSDP support. The acpidump currently always uses ACPI 2.0 format to dump RSDP, this patch adds ACPI 1.0 RSDP support. Link: https://bugs.acpica.org/show_bug.cgi?id=1097 Link: https://bugs.acpica.org/show_bug.cgi?id=1103 Signed-off-by: Lv Zheng Reported-and-tested-by: Rudolf Marek Reported-and-tested-by: Rafal Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/actables.h | 2 ++ drivers/acpi/acpica/tbxfroot.c | 33 +++++++++++++++++++++++++++++++- tools/power/acpi/tools/acpidump/apdump.c | 2 +- 3 files changed, 35 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/actables.h b/drivers/acpi/acpica/actables.h index f14882788eee..1afe46e44dac 100644 --- a/drivers/acpi/acpica/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -49,6 +49,8 @@ acpi_status acpi_allocate_root_table(u32 initial_table_count); /* * tbxfroot - Root pointer utilities */ +u32 acpi_tb_get_rsdp_length(struct acpi_table_rsdp *rsdp); + acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp); u8 *acpi_tb_scan_memory_for_rsdp(u8 *start_address, u32 length); diff --git a/drivers/acpi/acpica/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index 65ab8fed3d5e..43a54af2b548 100644 --- a/drivers/acpi/acpica/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -48,6 +48,36 @@ #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxfroot") +/******************************************************************************* + * + * FUNCTION: acpi_tb_get_rsdp_length + * + * PARAMETERS: rsdp - Pointer to RSDP + * + * RETURN: Table length + * + * DESCRIPTION: Get the length of the RSDP + * + ******************************************************************************/ +u32 acpi_tb_get_rsdp_length(struct acpi_table_rsdp *rsdp) +{ + + if (!ACPI_VALIDATE_RSDP_SIG(rsdp->signature)) { + + /* BAD Signature */ + + return (0); + } + + /* "Length" field is available if table version >= 2 */ + + if (rsdp->revision >= 2) { + return (rsdp->length); + } else { + return (ACPI_RSDP_CHECKSUM_LENGTH); + } +} + /******************************************************************************* * * FUNCTION: acpi_tb_validate_rsdp @@ -59,7 +89,8 @@ ACPI_MODULE_NAME("tbxfroot") * DESCRIPTION: Validate the RSDP (ptr) * ******************************************************************************/ -acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp *rsdp) + +acpi_status acpi_tb_validate_rsdp(struct acpi_table_rsdp * rsdp) { /* diff --git a/tools/power/acpi/tools/acpidump/apdump.c b/tools/power/acpi/tools/acpidump/apdump.c index 53cee781e24e..24d32968802d 100644 --- a/tools/power/acpi/tools/acpidump/apdump.c +++ b/tools/power/acpi/tools/acpidump/apdump.c @@ -146,7 +146,7 @@ u32 ap_get_table_length(struct acpi_table_header *table) if (ACPI_VALIDATE_RSDP_SIG(table->signature)) { rsdp = ACPI_CAST_PTR(struct acpi_table_rsdp, table); - return (rsdp->length); + return (acpi_tb_get_rsdp_length(rsdp)); } /* Normal ACPI table */ -- cgit v1.2.3 From f19f1a7e12b40c601a475c4fcb09dc0126d4bc51 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:39 +0800 Subject: ACPICA: Events: Reduce indent divergences of events files. This patch reduces indent divergences first in order to reduce human intervention work for the follow-up linuxized event patches. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/achware.h | 2 +- drivers/acpi/acpica/aclocal.h | 4 ++-- drivers/acpi/acpica/evgpe.c | 23 ++++++++++++----------- drivers/acpi/acpica/evgpeinit.c | 1 + drivers/acpi/acpica/evxface.c | 9 ++++----- drivers/acpi/acpica/evxfgpe.c | 9 ++++----- drivers/acpi/acpica/hwgpe.c | 2 +- 7 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/acpi/acpica/achware.h b/drivers/acpi/acpica/achware.h index 2ad2351a9833..c318d3e27893 100644 --- a/drivers/acpi/acpica/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -127,7 +127,7 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, - acpi_event_status * event_status); + acpi_event_status *event_status); acpi_status acpi_hw_disable_all_gpes(void); diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 2747279fbe3c..c00e7e41ad75 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -413,8 +413,8 @@ struct acpi_gpe_handler_info { acpi_gpe_handler address; /* Address of handler, if any */ void *context; /* Context to be passed to handler */ struct acpi_namespace_node *method_node; /* Method node for this GPE level (saved) */ - u8 original_flags; /* Original (pre-handler) GPE info */ - u8 originally_enabled; /* True if GPE was originally enabled */ + u8 original_flags; /* Original (pre-handler) GPE info */ + u8 originally_enabled; /* True if GPE was originally enabled */ }; /* Notify info for implicit notify, multiple device objects */ diff --git a/drivers/acpi/acpica/evgpe.c b/drivers/acpi/acpica/evgpe.c index e4ba4dec86af..2095dfb72bcb 100644 --- a/drivers/acpi/acpica/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -100,13 +100,14 @@ acpi_ev_update_gpe_enable_mask(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_enable_gpe * - * PARAMETERS: gpe_event_info - GPE to enable + * PARAMETERS: gpe_event_info - GPE to enable * * RETURN: Status * * DESCRIPTION: Clear a GPE of stale events and enable it. * ******************************************************************************/ + acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) { acpi_status status; @@ -125,6 +126,7 @@ acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) } /* Clear the GPE (of stale events) */ + status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); @@ -136,7 +138,6 @@ acpi_status acpi_ev_enable_gpe(struct acpi_gpe_event_info *gpe_event_info) return_ACPI_STATUS(status); } - /******************************************************************************* * * FUNCTION: acpi_ev_add_gpe_reference @@ -212,7 +213,7 @@ acpi_ev_remove_gpe_reference(struct acpi_gpe_event_info *gpe_event_info) if (ACPI_SUCCESS(status)) { status = acpi_hw_low_set_gpe(gpe_event_info, - ACPI_GPE_DISABLE); + ACPI_GPE_DISABLE); } if (ACPI_FAILURE(status)) { @@ -334,7 +335,7 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, * ******************************************************************************/ -u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) +u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info *gpe_xrupt_list) { acpi_status status; struct acpi_gpe_block_info *gpe_block; @@ -427,7 +428,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Check if there is anything active at all in this register */ - enabled_status_byte = (u8) (status_reg & enable_reg); + enabled_status_byte = (u8)(status_reg & enable_reg); if (!enabled_status_byte) { /* No active GPEs in this register, move on */ @@ -450,7 +451,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) acpi_ev_gpe_dispatch(gpe_block-> node, &gpe_block-> - event_info[((acpi_size) i * ACPI_GPE_REGISTER_WIDTH) + j], j + gpe_register_info->base_gpe_number); + event_info[((acpi_size) i * ACPI_GPE_REGISTER_WIDTH) + j], j + gpe_register_info->base_gpe_number); } } } @@ -636,7 +637,7 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_enable_gpe(void *context) * ******************************************************************************/ -acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) +acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info * gpe_event_info) { acpi_status status; @@ -666,9 +667,9 @@ acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_gpe_dispatch * - * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 - * gpe_event_info - Info for this GPE - * gpe_number - Number relative to the parent GPE block + * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 + * gpe_event_info - Info for this GPE + * gpe_number - Number relative to the parent GPE block * * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED * @@ -681,7 +682,7 @@ acpi_status acpi_ev_finish_gpe(struct acpi_gpe_event_info *gpe_event_info) u32 acpi_ev_gpe_dispatch(struct acpi_namespace_node *gpe_device, - struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) + struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) { acpi_status status; u32 return_value; diff --git a/drivers/acpi/acpica/evgpeinit.c b/drivers/acpi/acpica/evgpeinit.c index 49fc7effd961..7be928379879 100644 --- a/drivers/acpi/acpica/evgpeinit.c +++ b/drivers/acpi/acpica/evgpeinit.c @@ -424,6 +424,7 @@ acpi_ev_match_gpe_method(acpi_handle obj_handle, } /* Disable the GPE in case it's been enabled already. */ + (void)acpi_hw_low_set_gpe(gpe_event_info, ACPI_GPE_DISABLE); /* diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index 11e5803b8b41..935fd7693d61 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -786,14 +786,13 @@ acpi_install_gpe_handler(acpi_handle gpe_device, handler->method_node = gpe_event_info->dispatch.method_node; handler->original_flags = (u8)(gpe_event_info->flags & (ACPI_GPE_XRUPT_TYPE_MASK | - ACPI_GPE_DISPATCH_MASK)); + ACPI_GPE_DISPATCH_MASK)); /* * If the GPE is associated with a method, it may have been enabled * automatically during initialization, in which case it has to be * disabled now to avoid spurious execution of the handler. */ - if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) && gpe_event_info->runtime_count) { handler->originally_enabled = 1; @@ -808,7 +807,7 @@ acpi_install_gpe_handler(acpi_handle gpe_device, gpe_event_info->flags &= ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); - gpe_event_info->flags |= (u8) (type | ACPI_GPE_DISPATCH_HANDLER); + gpe_event_info->flags |= (u8)(type | ACPI_GPE_DISPATCH_HANDLER); acpi_os_release_lock(acpi_gbl_gpe_lock, flags); @@ -893,7 +892,7 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, gpe_event_info->dispatch.method_node = handler->method_node; gpe_event_info->flags &= - ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); + ~(ACPI_GPE_XRUPT_TYPE_MASK | ACPI_GPE_DISPATCH_MASK); gpe_event_info->flags |= handler->original_flags; /* @@ -946,7 +945,7 @@ ACPI_EXPORT_SYMBOL(acpi_remove_gpe_handler) * handle is returned. * ******************************************************************************/ -acpi_status acpi_acquire_global_lock(u16 timeout, u32 * handle) +acpi_status acpi_acquire_global_lock(u16 timeout, u32 *handle) { acpi_status status; diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 56710a03c9b0..2529d3c1c6b7 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -106,8 +106,8 @@ ACPI_EXPORT_SYMBOL(acpi_update_all_gpes) * * FUNCTION: acpi_enable_gpe * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block * * RETURN: Status * @@ -115,7 +115,6 @@ ACPI_EXPORT_SYMBOL(acpi_update_all_gpes) * hardware-enabled. * ******************************************************************************/ - acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number) { acpi_status status = AE_BAD_PARAMETER; @@ -490,8 +489,8 @@ ACPI_EXPORT_SYMBOL(acpi_clear_gpe) * * FUNCTION: acpi_get_gpe_status * - * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 - * gpe_number - GPE level within the GPE block + * PARAMETERS: gpe_device - Parent GPE Device. NULL for GPE0/GPE1 + * gpe_number - GPE level within the GPE block * event_status - Where the current status of the event * will be returned * diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index ea62d40fd161..aa884d42361c 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -202,7 +202,7 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info * gpe_event_info) acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, - acpi_event_status * event_status) + acpi_event_status *event_status) { u32 in_byte; u32 register_bit; -- cgit v1.2.3 From 437b75123ca5ee36897bfcd1272e96109bed3ea1 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:45 +0800 Subject: ACPICA: Events: Reduce source code difference in acpi_install_gpe_handler(). There is a sanity check in ACPICA upstream, complaining mis-matched interrupt type for originally enabled GPEs that are going to be dispatched by OSPM handlers. This is only a warning message noting developers such conflict between BIOS and OSPM. This patch ports this warning message from ACPICA upstream to reduce source code difference between Linux and ACPICA upstream. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxface.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index 935fd7693d61..79b6ed204f44 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -795,8 +795,16 @@ acpi_install_gpe_handler(acpi_handle gpe_device, */ if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) && gpe_event_info->runtime_count) { - handler->originally_enabled = 1; + handler->originally_enabled = TRUE; (void)acpi_ev_remove_gpe_reference(gpe_event_info); + + /* Sanity check of original type against new type */ + + if (type != + (u32)(gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK)) { + ACPI_WARNING((AE_INFO, + "GPE type mismatch (level/edge)")); + } } /* Install the handler */ -- cgit v1.2.3 From 1809919a309dc8c8faad3c048bfda9a9f3fa0443 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:51 +0800 Subject: ACPICA: Events: Update GPE handler removal, match behavior of handler install. The originally_enabled check is not paired between acpi_install_gpe_handler() and acpi_remove_gpe_handler(). In ACPICA upstream, there is code to protect original enabled state for ACPI_GPE_DISPATCH_NOTIFY and this commit fixes an issue for this feature. Link: https://github.com/acpica/acpica/commit/967f314c Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxface.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/acpica/evxface.c b/drivers/acpi/acpica/evxface.c index 79b6ed204f44..55a58f3ec8df 100644 --- a/drivers/acpi/acpica/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -793,8 +793,9 @@ acpi_install_gpe_handler(acpi_handle gpe_device, * automatically during initialization, in which case it has to be * disabled now to avoid spurious execution of the handler. */ - if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) - && gpe_event_info->runtime_count) { + if (((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) || + (handler->original_flags & ACPI_GPE_DISPATCH_NOTIFY)) && + gpe_event_info->runtime_count) { handler->originally_enabled = TRUE; (void)acpi_ev_remove_gpe_reference(gpe_event_info); @@ -908,7 +909,8 @@ acpi_remove_gpe_handler(acpi_handle gpe_device, * enabled, it should be enabled at this point to restore the * post-initialization configuration. */ - if ((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) && + if (((handler->original_flags & ACPI_GPE_DISPATCH_METHOD) || + (handler->original_flags & ACPI_GPE_DISPATCH_NOTIFY)) && handler->originally_enabled) { (void)acpi_ev_add_gpe_reference(gpe_event_info); } -- cgit v1.2.3 From a08f813e58169a8edd01e13d73f60d8561f3ecea Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:39:57 +0800 Subject: ACPICA: Events: Reduce source code difference for the ACPI_EVENT_FLAG_HANDLE support. This patch is a partial linuxized result of the following ACPICA commit: ACPICA commit: a73b66c6aa1846d055bb6390d9c9b9902f7d804d Subject: Add "has handler" flag to event/gpe status interfaces. This change adds a new flag, ACPI_EVENT_FLAGS_HAS_HANDLER to the acpi_get_event_status and acpi_get_gpe_status external interfaces. It is set if the event/gpe currently has a handler associated with it. This commit back ports ACPI_EVENT_FLAG_HANDLE from Linux upstream to ACPICA, the flag along with its support code currently can only be found in the Linux upstream and is used by the ACPI sysfs GPE interfaces and the ACPI bus scanning support. Link: https://github.com/acpica/acpica/commit/a73b66c6 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfevnt.c | 40 ++++++++++++++++++++++++++-------------- drivers/acpi/acpica/evxfgpe.c | 3 --- drivers/acpi/acpica/hwgpe.c | 7 +++++++ include/acpi/actypes.h | 4 ++-- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index e286640ad4ff..a8a077c97187 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -324,8 +324,9 @@ ACPI_EXPORT_SYMBOL(acpi_clear_event) ******************************************************************************/ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) { - acpi_status status = AE_OK; - u32 value; + acpi_status status; + acpi_event_status local_event_status = 0; + u32 in_byte; ACPI_FUNCTION_TRACE(acpi_get_event_status); @@ -339,29 +340,40 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) return_ACPI_STATUS(AE_BAD_PARAMETER); } - /* Get the status of the requested fixed event */ + /* Fixed event currently can be dispatched? */ + + if (acpi_gbl_fixed_event_handlers[event].handler) { + local_event_status |= ACPI_EVENT_FLAG_HANDLE; + } + + /* Fixed event currently enabled? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - enable_register_id, &value); - if (ACPI_FAILURE(status)) + enable_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - *event_status = value; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_ENABLED; + } + + /* Fixed event currently active? */ status = acpi_read_bit_register(acpi_gbl_fixed_event_info[event]. - status_register_id, &value); - if (ACPI_FAILURE(status)) + status_register_id, &in_byte); + if (ACPI_FAILURE(status)) { return_ACPI_STATUS(status); + } - if (value) - *event_status |= ACPI_EVENT_FLAG_SET; - - if (acpi_gbl_fixed_event_handlers[event].handler) - *event_status |= ACPI_EVENT_FLAG_HANDLE; + if (in_byte) { + local_event_status |= ACPI_EVENT_FLAG_SET; + } - return_ACPI_STATUS(status); + (*event_status) = local_event_status; + return_ACPI_STATUS(AE_OK); } ACPI_EXPORT_SYMBOL(acpi_get_event_status) diff --git a/drivers/acpi/acpica/evxfgpe.c b/drivers/acpi/acpica/evxfgpe.c index 2529d3c1c6b7..e889a5304abd 100644 --- a/drivers/acpi/acpica/evxfgpe.c +++ b/drivers/acpi/acpica/evxfgpe.c @@ -523,9 +523,6 @@ acpi_get_gpe_status(acpi_handle gpe_device, status = acpi_hw_get_gpe_status(gpe_event_info, event_status); - if (gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) - *event_status |= ACPI_EVENT_FLAG_HANDLE; - unlock_and_exit: acpi_os_release_lock(acpi_gbl_gpe_lock, flags); return_ACPI_STATUS(status); diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index aa884d42361c..0302bc36287b 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -216,6 +216,13 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, return (AE_BAD_PARAMETER); } + /* GPE currently handled? */ + + if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) != + ACPI_GPE_DISPATCH_NONE) { + local_event_status |= ACPI_EVENT_FLAG_HANDLE; + } + /* Get the info block for the entire GPE register */ gpe_register_info = gpe_event_info->register_info; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index ac03ec81d342..857830d8989b 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -721,7 +721,7 @@ typedef u32 acpi_event_type; * | | | +--- Enabled for wake? * | | +----- Set? * | +------- Has a handler? - * +----------- + * +------------- */ typedef u32 acpi_event_status; @@ -729,7 +729,7 @@ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 -#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 +#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 /* Actions for acpi_set_gpe, acpi_gpe_wakeup, acpi_hw_low_set_gpe */ -- cgit v1.2.3 From 2f8572344e65296d13c1a771cacfea60916d91dc Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Fri, 10 Oct 2014 10:40:05 +0800 Subject: ACPICA: Events: Reduce source code difference for the ACPI_EVENT_FLAG_HANDLE renaming. This patch is partial linuxized result of the following ACPICA commit: ACPICA commit: a73b66c6aa1846d055bb6390d9c9b9902f7d804d Subject: Add "has handler" flag to event/gpe status interfaces. This change adds a new flag, ACPI_EVENT_FLAGS_HAS_HANDLER to the acpi_get_event_status and acpi_get_gpe_status external interfaces. It is set if the event/gpe currently has a handler associated with it. This patch contains the code to rename ACPI_EVENT_FLAG_HANDLE to ACPI_EVENT_FLAG_HAS_HANDLER, and the corresponding updates of its usages. Link: https://github.com/acpica/acpica/commit/a73b66c6 Signed-off-by: Lv Zheng Signed-off-by: Bob Moore Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/evxfevnt.c | 2 +- drivers/acpi/acpica/hwgpe.c | 2 +- drivers/acpi/scan.c | 2 +- drivers/acpi/sysfs.c | 4 ++-- include/acpi/actypes.h | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/acpica/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index a8a077c97187..bb8cbf5961bf 100644 --- a/drivers/acpi/acpica/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -343,7 +343,7 @@ acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status) /* Fixed event currently can be dispatched? */ if (acpi_gbl_fixed_event_handlers[event].handler) { - local_event_status |= ACPI_EVENT_FLAG_HANDLE; + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; } /* Fixed event currently enabled? */ diff --git a/drivers/acpi/acpica/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 0302bc36287b..48ac7b7b59cd 100644 --- a/drivers/acpi/acpica/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -220,7 +220,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, if ((gpe_event_info->flags & ACPI_GPE_DISPATCH_MASK) != ACPI_GPE_DISPATCH_NONE) { - local_event_status |= ACPI_EVENT_FLAG_HANDLE; + local_event_status |= ACPI_EVENT_FLAG_HAS_HANDLER; } /* Get the info block for the entire GPE register */ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index ae44d8654c82..f1d96e7519cb 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1470,7 +1470,7 @@ static void acpi_wakeup_gpe_init(struct acpi_device *device) if (ACPI_FAILURE(status)) return; - wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HANDLE); + wakeup->flags.run_wake = !!(event_status & ACPI_EVENT_FLAG_HAS_HANDLER); } static void acpi_bus_get_wakeup_device_flags(struct acpi_device *device) diff --git a/drivers/acpi/sysfs.c b/drivers/acpi/sysfs.c index 38cb9782d4b8..13e577c80201 100644 --- a/drivers/acpi/sysfs.c +++ b/drivers/acpi/sysfs.c @@ -537,7 +537,7 @@ static ssize_t counter_show(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) size += sprintf(buf + size, " invalid"); else if (status & ACPI_EVENT_FLAG_ENABLED) size += sprintf(buf + size, " enabled"); @@ -581,7 +581,7 @@ static ssize_t counter_set(struct kobject *kobj, if (result) goto end; - if (!(status & ACPI_EVENT_FLAG_HANDLE)) { + if (!(status & ACPI_EVENT_FLAG_HAS_HANDLER)) { printk(KERN_WARNING PREFIX "Can not change Invalid GPE/Fixed Event status\n"); return -EINVAL; diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 857830d8989b..7000e66f768e 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -729,7 +729,7 @@ typedef u32 acpi_event_status; #define ACPI_EVENT_FLAG_ENABLED (acpi_event_status) 0x01 #define ACPI_EVENT_FLAG_WAKE_ENABLED (acpi_event_status) 0x02 #define ACPI_EVENT_FLAG_SET (acpi_event_status) 0x04 -#define ACPI_EVENT_FLAG_HANDLE (acpi_event_status) 0x08 +#define ACPI_EVENT_FLAG_HAS_HANDLER (acpi_event_status) 0x08 /* Actions for acpi_set_gpe, acpi_gpe_wakeup, acpi_hw_low_set_gpe */ -- cgit v1.2.3 From f2d348fac77f6cf8eb5d0eb37e16b13d8b796bda Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:40:11 +0800 Subject: ACPICA: iASL/Disassembler: Add support for hardware summary mapfiles. Adds support for both iASL and the disassembler to create a hardware and connection summary mapfile (via the -lm option.) Linux isn't affected by this patch because iASL is not in the Linux upstream. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpica/amlresrc.h | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/drivers/acpi/acpica/amlresrc.h b/drivers/acpi/acpica/amlresrc.h index f3f834408441..3a0beeb86ba5 100644 --- a/drivers/acpi/acpica/amlresrc.h +++ b/drivers/acpi/acpica/amlresrc.h @@ -117,6 +117,12 @@ struct asl_resource_node { struct asl_resource_node *next; }; +struct asl_resource_info { + union acpi_parse_object *descriptor_type_op; /* Resource descriptor parse node */ + union acpi_parse_object *mapping_op; /* Used for mapfile support */ + u32 current_byte_offset; /* Offset in resource template */ +}; + /* Macros used to generate AML resource length fields */ #define ACPI_AML_SIZE_LARGE(r) (sizeof (r) - sizeof (struct aml_resource_large_header)) @@ -449,4 +455,32 @@ union aml_resource { u8 byte_item; }; +/* Interfaces used by both the disassembler and compiler */ + +void +mp_save_gpio_info(union acpi_parse_object *op, + union aml_resource *resource, + u32 pin_count, u16 *pin_list, char *device_name); + +void +mp_save_serial_info(union acpi_parse_object *op, + union aml_resource *resource, char *device_name); + +char *mp_get_hid_from_parse_tree(struct acpi_namespace_node *hid_node); + +char *mp_get_hid_via_namestring(char *device_name); + +char *mp_get_connection_info(union acpi_parse_object *op, + u32 pin_index, + struct acpi_namespace_node **target_node, + char **target_name); + +char *mp_get_parent_device_hid(union acpi_parse_object *op, + struct acpi_namespace_node **target_node, + char **parent_device_name); + +char *mp_get_ddn_value(char *device_name); + +char *mp_get_hid_value(struct acpi_namespace_node *device_node); + #endif -- cgit v1.2.3 From 63c9043b905d04edebfe37b59d5415a46e5008bb Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:40:19 +0800 Subject: ACPICA: acpiexec: Do not put STDIN into raw mode unless it is a terminal. Eliminate an error message for batch-mode processing on unix systems. ACPICA BZ 1114. This patch is mainly for fixing the issues of acpiexec which is not in the Linux upstream. Link: https://bugs.acpica.org/show_bug.cgi?id=1114 Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- tools/power/acpi/os_specific/service_layers/osunixxf.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/power/acpi/os_specific/service_layers/osunixxf.c b/tools/power/acpi/os_specific/service_layers/osunixxf.c index 60b58cd18410..7ccb073f8316 100644 --- a/tools/power/acpi/os_specific/service_layers/osunixxf.c +++ b/tools/power/acpi/os_specific/service_layers/osunixxf.c @@ -122,6 +122,14 @@ static void os_enter_line_edit_mode(void) { struct termios local_term_attributes; + term_attributes_were_set = 0; + + /* STDIN must be a terminal */ + + if (!isatty(STDIN_FILENO)) { + return; + } + /* Get and keep the original attributes */ if (tcgetattr(STDIN_FILENO, &original_term_attributes)) { -- cgit v1.2.3 From 9fc3d1d09cf7f81d5775712dc64c3db4862ee59d Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Fri, 10 Oct 2014 10:40:28 +0800 Subject: ACPICA: Update version to 20140926. Version 20140926. Signed-off-by: Bob Moore Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- include/acpi/acpixf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 9fc1d71c82bc..ab2acf629a64 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20140828 +#define ACPI_CA_VERSION 0x20140926 #include #include -- cgit v1.2.3 From c95f25b03667c50e7184a63bdf4c32dff0de2f6f Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 14 Oct 2014 14:23:38 +0800 Subject: ACPI / EC: Add CPU ID to debugging messages. This patch adds CPU ID to the context entries' debugging output. no functional changes. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index cb6066c809ea..9cb4d0c468eb 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -181,7 +181,8 @@ static bool advance_transaction(struct acpi_ec *ec) u8 status; bool wakeup = false; - pr_debug("===== %s =====\n", in_interrupt() ? "IRQ" : "TASK"); + pr_debug("===== %s (%d) =====\n", + in_interrupt() ? "IRQ" : "TASK", smp_processor_id()); status = acpi_ec_read_status(ec); t = ec->curr; if (!t) -- cgit v1.2.3 From 459572a7503bccb5435936488088c8db4f51d3ab Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 14 Oct 2014 14:23:43 +0800 Subject: ACPI / EC: Enhance the logs to apply to QR_EC transactions. Currently some logs are applied to new transactions, but QR_EC transactions are not included. This patch merges the code path to make the logs also applying to the QR_EC transactions. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 9cb4d0c468eb..b15a431301a0 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -303,12 +303,15 @@ static int acpi_ec_transaction_unlocked(struct acpi_ec *ec, spin_lock_irqsave(&ec->lock, tmp); /* following two actions should be kept atomic */ ec->curr = t; + pr_debug("transaction start (cmd=0x%02x, addr=0x%02x)\n", + t->command, t->wdata ? t->wdata[0] : 0); start_transaction(ec); spin_unlock_irqrestore(&ec->lock, tmp); ret = ec_poll(ec); spin_lock_irqsave(&ec->lock, tmp); if (ec->curr->command == ACPI_EC_COMMAND_QUERY) clear_bit(EC_FLAGS_QUERY_PENDING, &ec->flags); + pr_debug("transaction end\n"); ec->curr = NULL; spin_unlock_irqrestore(&ec->lock, tmp); return ret; @@ -334,8 +337,6 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) goto unlock; } } - pr_debug("transaction start (cmd=0x%02x, addr=0x%02x)\n", - t->command, t->wdata ? t->wdata[0] : 0); /* disable GPE during transaction if storm is detected */ if (test_bit(EC_FLAGS_GPE_STORM, &ec->flags)) { /* It has to be disabled, so that it doesn't trigger. */ @@ -356,7 +357,6 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) t->irq_count); set_bit(EC_FLAGS_GPE_STORM, &ec->flags); } - pr_debug("transaction end\n"); if (ec->global_lock) acpi_release_global_lock(glk); unlock: -- cgit v1.2.3 From e34c0e2bb4046d574224de3752642177a45a067a Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 14 Oct 2014 14:23:49 +0800 Subject: ACPI / EC: Add detailed command/query debugging information. Developers really don't need to translate EC commands in mind. This patch adds detailed debugging information for the EC commands. The address can be found in the follow-up sequential EC_DATA(W) accesses, thus this patch also removes some of the redundant address information. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index b15a431301a0..fcf667c3961e 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -164,6 +164,27 @@ static inline void acpi_ec_write_data(struct acpi_ec *ec, u8 data) outb(data, ec->data_addr); } +#ifdef DEBUG +static const char *acpi_ec_cmd_string(u8 cmd) +{ + switch (cmd) { + case 0x80: + return "RD_EC"; + case 0x81: + return "WR_EC"; + case 0x82: + return "BE_EC"; + case 0x83: + return "BD_EC"; + case 0x84: + return "QR_EC"; + } + return "UNKNOWN"; +} +#else +#define acpi_ec_cmd_string(cmd) "UNDEF" +#endif + static int ec_transaction_completed(struct acpi_ec *ec) { unsigned long flags; @@ -199,7 +220,8 @@ static bool advance_transaction(struct acpi_ec *ec) if (t->rlen == t->ri) { t->flags |= ACPI_EC_COMMAND_COMPLETE; if (t->command == ACPI_EC_COMMAND_QUERY) - pr_debug("hardware QR_EC completion\n"); + pr_debug("***** Command(%s) hardware completion *****\n", + acpi_ec_cmd_string(t->command)); wakeup = true; } } else @@ -222,7 +244,8 @@ static bool advance_transaction(struct acpi_ec *ec) t->flags |= ACPI_EC_COMMAND_POLL; t->rdata[t->ri++] = 0x00; t->flags |= ACPI_EC_COMMAND_COMPLETE; - pr_debug("software QR_EC completion\n"); + pr_debug("***** Command(%s) software completion *****\n", + acpi_ec_cmd_string(t->command)); wakeup = true; } else if ((status & ACPI_EC_FLAG_IBF) == 0) { acpi_ec_write_cmd(ec, t->command); @@ -303,15 +326,16 @@ static int acpi_ec_transaction_unlocked(struct acpi_ec *ec, spin_lock_irqsave(&ec->lock, tmp); /* following two actions should be kept atomic */ ec->curr = t; - pr_debug("transaction start (cmd=0x%02x, addr=0x%02x)\n", - t->command, t->wdata ? t->wdata[0] : 0); + pr_debug("***** Command(%s) started *****\n", + acpi_ec_cmd_string(t->command)); start_transaction(ec); spin_unlock_irqrestore(&ec->lock, tmp); ret = ec_poll(ec); spin_lock_irqsave(&ec->lock, tmp); if (ec->curr->command == ACPI_EC_COMMAND_QUERY) clear_bit(EC_FLAGS_QUERY_PENDING, &ec->flags); - pr_debug("transaction end\n"); + pr_debug("***** Command(%s) stopped *****\n", + acpi_ec_cmd_string(t->command)); ec->curr = NULL; spin_unlock_irqrestore(&ec->lock, tmp); return ret; -- cgit v1.2.3 From d3090b6a6cfa6c7a762d6c13340170ca072b2b81 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 14 Oct 2014 14:23:55 +0800 Subject: ACPI / EC: Refine event/query debugging messages. This patch refines event/query debugging messages to use a unified format as commands. Developers can clearly find different processes by checking different log seperators. No functional changes. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index fcf667c3961e..d0247d38d96f 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -332,8 +332,10 @@ static int acpi_ec_transaction_unlocked(struct acpi_ec *ec, spin_unlock_irqrestore(&ec->lock, tmp); ret = ec_poll(ec); spin_lock_irqsave(&ec->lock, tmp); - if (ec->curr->command == ACPI_EC_COMMAND_QUERY) + if (ec->curr->command == ACPI_EC_COMMAND_QUERY) { clear_bit(EC_FLAGS_QUERY_PENDING, &ec->flags); + pr_debug("***** Event stopped *****\n"); + } pr_debug("***** Command(%s) stopped *****\n", acpi_ec_cmd_string(t->command)); ec->curr = NULL; @@ -617,12 +619,12 @@ static void acpi_ec_run(void *cxt) struct acpi_ec_query_handler *handler = cxt; if (!handler) return; - pr_debug("start query execution\n"); + pr_debug("##### Query(0x%02x) started #####\n", handler->query_bit); if (handler->func) handler->func(handler->data); else if (handler->handle) acpi_evaluate_object(handler->handle, NULL, NULL, NULL); - pr_debug("stop query execution\n"); + pr_debug("##### Query(0x%02x) stopped #####\n", handler->query_bit); kfree(handler); } @@ -645,8 +647,8 @@ static int acpi_ec_sync_query(struct acpi_ec *ec, u8 *data) if (!copy) return -ENOMEM; memcpy(copy, handler, sizeof(*copy)); - pr_debug("push query execution (0x%2x) on queue\n", - value); + pr_debug("##### Query(0x%02x) scheduled #####\n", + handler->query_bit); return acpi_os_execute((copy->func) ? OSL_NOTIFY_HANDLER : OSL_GPE_HANDLER, acpi_ec_run, copy); @@ -669,7 +671,7 @@ static int ec_check_sci(struct acpi_ec *ec, u8 state) { if (state & ACPI_EC_FLAG_SCI) { if (!test_and_set_bit(EC_FLAGS_QUERY_PENDING, &ec->flags)) { - pr_debug("push gpe query to the queue\n"); + pr_debug("***** Event started *****\n"); return acpi_os_execute(OSL_NOTIFY_HANDLER, acpi_ec_gpe_query, ec); } -- cgit v1.2.3 From 7a73e60e398a61612651d503aef9c81260d33918 Mon Sep 17 00:00:00 2001 From: Lv Zheng Date: Tue, 14 Oct 2014 14:24:01 +0800 Subject: ACPI / EC: Cleanup coding style. This patch cleans up the following coding style issues that are detected by scripts/checkpatch.pl: ERROR: code indent should use tabs where possible ERROR: "foo * bar" should be "foo *bar" WARNING: Missing a blank line after declarations WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable WARNING: void function return statements are not generally useful WARNING: else is not generally useful after a break or return WARNING: break is not useful after a goto or return WARNING: braces {} are not necessary for single statement blocks WARNING: line over 80 characters WARNING: msleep < 20ms can sleep for up to 20ms; see Documentation/timers/timers-howto.txt No functional changes. Signed-off-by: Lv Zheng Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 56 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 30 insertions(+), 26 deletions(-) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d0247d38d96f..3d304ff7f095 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -128,12 +128,13 @@ static int EC_FLAGS_SKIP_DSDT_SCAN; /* Not all BIOS survive early DSDT scan */ static int EC_FLAGS_CLEAR_ON_RESUME; /* Needs acpi_ec_clear() on boot/resume */ /* -------------------------------------------------------------------------- - Transaction Management - -------------------------------------------------------------------------- */ + * Transaction Management + * -------------------------------------------------------------------------- */ static inline u8 acpi_ec_read_status(struct acpi_ec *ec) { u8 x = inb(ec->command_addr); + pr_debug("EC_SC(R) = 0x%2.2x " "SCI_EVT=%d BURST=%d CMD=%d IBF=%d OBF=%d\n", x, @@ -148,6 +149,7 @@ static inline u8 acpi_ec_read_status(struct acpi_ec *ec) static inline u8 acpi_ec_read_data(struct acpi_ec *ec) { u8 x = inb(ec->data_addr); + pr_debug("EC_DATA(R) = 0x%2.2x\n", x); return x; } @@ -189,6 +191,7 @@ static int ec_transaction_completed(struct acpi_ec *ec) { unsigned long flags; int ret = 0; + spin_lock_irqsave(&ec->lock, flags); if (ec->curr && (ec->curr->flags & ACPI_EC_COMMAND_COMPLETE)) ret = 1; @@ -288,6 +291,7 @@ static int ec_poll(struct acpi_ec *ec) { unsigned long flags; int repeat = 5; /* number of command restarts */ + while (repeat--) { unsigned long delay = jiffies + msecs_to_jiffies(ec_delay); @@ -320,6 +324,7 @@ static int acpi_ec_transaction_unlocked(struct acpi_ec *ec, { unsigned long tmp; int ret = 0; + if (EC_FLAGS_MSI) udelay(ACPI_EC_MSI_UDELAY); /* start transaction */ @@ -347,6 +352,7 @@ static int acpi_ec_transaction(struct acpi_ec *ec, struct transaction *t) { int status; u32 glk; + if (!ec || (!t) || (t->wlen && !t->wdata) || (t->rlen && !t->rdata)) return -EINVAL; if (t->rdata) @@ -410,7 +416,7 @@ static int acpi_ec_burst_disable(struct acpi_ec *ec) acpi_ec_transaction(ec, &t) : 0; } -static int acpi_ec_read(struct acpi_ec *ec, u8 address, u8 * data) +static int acpi_ec_read(struct acpi_ec *ec, u8 address, u8 *data) { int result; u8 d; @@ -446,10 +452,9 @@ int ec_read(u8 addr, u8 *val) if (!err) { *val = temp_data; return 0; - } else - return err; + } + return err; } - EXPORT_SYMBOL(ec_read); int ec_write(u8 addr, u8 val) @@ -463,22 +468,21 @@ int ec_write(u8 addr, u8 val) return err; } - EXPORT_SYMBOL(ec_write); int ec_transaction(u8 command, - const u8 * wdata, unsigned wdata_len, - u8 * rdata, unsigned rdata_len) + const u8 *wdata, unsigned wdata_len, + u8 *rdata, unsigned rdata_len) { struct transaction t = {.command = command, .wdata = wdata, .rdata = rdata, .wlen = wdata_len, .rlen = rdata_len}; + if (!first_ec) return -ENODEV; return acpi_ec_transaction(first_ec, &t); } - EXPORT_SYMBOL(ec_transaction); /* Get the handle to the EC device */ @@ -488,7 +492,6 @@ acpi_handle ec_get_handle(void) return NULL; return first_ec->handle; } - EXPORT_SYMBOL(ec_get_handle); /* @@ -552,13 +555,14 @@ void acpi_ec_unblock_transactions_early(void) clear_bit(EC_FLAGS_BLOCKED, &first_ec->flags); } -static int acpi_ec_query_unlocked(struct acpi_ec *ec, u8 * data) +static int acpi_ec_query_unlocked(struct acpi_ec *ec, u8 *data) { int result; u8 d; struct transaction t = {.command = ACPI_EC_COMMAND_QUERY, .wdata = NULL, .rdata = &d, .wlen = 0, .rlen = 1}; + if (!ec || !data) return -EINVAL; /* @@ -584,6 +588,7 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, { struct acpi_ec_query_handler *handler = kzalloc(sizeof(struct acpi_ec_query_handler), GFP_KERNEL); + if (!handler) return -ENOMEM; @@ -596,12 +601,12 @@ int acpi_ec_add_query_handler(struct acpi_ec *ec, u8 query_bit, mutex_unlock(&ec->mutex); return 0; } - EXPORT_SYMBOL_GPL(acpi_ec_add_query_handler); void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) { struct acpi_ec_query_handler *handler, *tmp; + mutex_lock(&ec->mutex); list_for_each_entry_safe(handler, tmp, &ec->list, node) { if (query_bit == handler->query_bit) { @@ -611,12 +616,12 @@ void acpi_ec_remove_query_handler(struct acpi_ec *ec, u8 query_bit) } mutex_unlock(&ec->mutex); } - EXPORT_SYMBOL_GPL(acpi_ec_remove_query_handler); static void acpi_ec_run(void *cxt) { struct acpi_ec_query_handler *handler = cxt; + if (!handler) return; pr_debug("##### Query(0x%02x) started #####\n", handler->query_bit); @@ -660,6 +665,7 @@ static int acpi_ec_sync_query(struct acpi_ec *ec, u8 *data) static void acpi_ec_gpe_query(void *ec_cxt) { struct acpi_ec *ec = ec_cxt; + if (!ec) return; mutex_lock(&ec->mutex); @@ -694,8 +700,8 @@ static u32 acpi_ec_gpe_handler(acpi_handle gpe_device, } /* -------------------------------------------------------------------------- - Address Space Management - -------------------------------------------------------------------------- */ + * Address Space Management + * -------------------------------------------------------------------------- */ static acpi_status acpi_ec_space_handler(u32 function, acpi_physical_address address, @@ -726,27 +732,26 @@ acpi_ec_space_handler(u32 function, acpi_physical_address address, switch (result) { case -EINVAL: return AE_BAD_PARAMETER; - break; case -ENODEV: return AE_NOT_FOUND; - break; case -ETIME: return AE_TIME; - break; default: return AE_OK; } } /* -------------------------------------------------------------------------- - Driver Interface - -------------------------------------------------------------------------- */ + * Driver Interface + * -------------------------------------------------------------------------- */ + static acpi_status ec_parse_io_ports(struct acpi_resource *resource, void *context); static struct acpi_ec *make_acpi_ec(void) { struct acpi_ec *ec = kzalloc(sizeof(struct acpi_ec), GFP_KERNEL); + if (!ec) return NULL; ec->flags = 1 << EC_FLAGS_QUERY_PENDING; @@ -769,9 +774,8 @@ acpi_ec_register_query_methods(acpi_handle handle, u32 level, status = acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); - if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) { + if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) acpi_ec_add_query_handler(ec, value, handle, NULL, NULL); - } return AE_OK; } @@ -780,7 +784,6 @@ ec_parse_device(acpi_handle handle, u32 Level, void *context, void **retval) { acpi_status status; unsigned long long tmp = 0; - struct acpi_ec *ec = context; /* clear addr values, ec_parse_io_ports depend on it */ @@ -808,6 +811,7 @@ ec_parse_device(acpi_handle handle, u32 Level, void *context, void **retval) static int ec_install_handlers(struct acpi_ec *ec) { acpi_status status; + if (test_bit(EC_FLAGS_HANDLERS_INSTALLED, &ec->flags)) return 0; status = acpi_install_gpe_handler(NULL, ec->gpe, @@ -1105,7 +1109,8 @@ int __init acpi_ec_ecdt_probe(void) boot_ec->data_addr = ecdt_ptr->data.address; boot_ec->gpe = ecdt_ptr->gpe; boot_ec->handle = ACPI_ROOT_OBJECT; - acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, &boot_ec->handle); + acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, + &boot_ec->handle); /* Don't trust ECDT, which comes from ASUSTek */ if (!EC_FLAGS_VALIDATE_ECDT) goto install; @@ -1189,6 +1194,5 @@ static void __exit acpi_ec_exit(void) { acpi_bus_unregister_driver(&acpi_ec_driver); - return; } #endif /* 0 */ -- cgit v1.2.3 From 51315cdfa0521fff3059cec5fb8ffecc7f37cba7 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:27 +0200 Subject: cpufreq: allow driver-specific data This commit extends the cpufreq_driver structure with an additional 'void *driver_data' field that can be filled by the ->probe() function of a cpufreq driver to pass additional custom information to the driver itself. A new function called cpufreq_get_driver_data() is added to allow a cpufreq driver to retrieve those driver data, since they are typically needed from a cpufreq_policy->init() callback, which does not have access to the cpufreq_driver structure. This function call is similar to the existing cpufreq_get_current_driver() function call. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 15 +++++++++++++++ include/linux/cpufreq.h | 2 ++ 2 files changed, 17 insertions(+) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 24bf76fba141..058d6e084a6d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1731,6 +1731,21 @@ const char *cpufreq_get_current_driver(void) } EXPORT_SYMBOL_GPL(cpufreq_get_current_driver); +/** + * cpufreq_get_driver_data - return current driver data + * + * Return the private data of the currently loaded cpufreq + * driver, or NULL if no cpufreq driver is loaded. + */ +void *cpufreq_get_driver_data(void) +{ + if (cpufreq_driver) + return cpufreq_driver->driver_data; + + return NULL; +} +EXPORT_SYMBOL_GPL(cpufreq_get_driver_data); + /********************************************************************* * NOTIFIER LISTS INTERFACE * *********************************************************************/ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 138336b6bb04..503b085b7832 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -219,6 +219,7 @@ __ATTR(_name, 0644, show_##_name, store_##_name) struct cpufreq_driver { char name[CPUFREQ_NAME_LEN]; u8 flags; + void *driver_data; /* needed by all drivers */ int (*init) (struct cpufreq_policy *policy); @@ -312,6 +313,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); const char *cpufreq_get_current_driver(void); +void *cpufreq_get_driver_data(void); static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) -- cgit v1.2.3 From 34e5a5273d6aa0ee8836bd5d6111b135ffae6931 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:28 +0200 Subject: cpufreq: cpufreq-dt: extend with platform_data This commit extends the cpufreq-dt driver to take a platform_data structure. This structure is for now used to tell the cpufreq-dt driver the layout of the clocks on the platform, i.e whether all CPUs share the same clock or whether each CPU has a separate clock. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 17 +++++++++++++++-- include/linux/cpufreq-dt.h | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 include/linux/cpufreq-dt.h diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 6bbb8b913446..52facdaf531e 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,7 @@ try_again: static int cpufreq_init(struct cpufreq_policy *policy) { + struct cpufreq_dt_platform_data *pd; struct cpufreq_frequency_table *freq_table; struct thermal_cooling_device *cdev; struct device_node *np; @@ -265,9 +267,18 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->driver_data = priv; policy->clk = cpu_clk; - ret = cpufreq_generic_init(policy, freq_table, transition_latency); - if (ret) + ret = cpufreq_table_validate_and_show(policy, freq_table); + if (ret) { + dev_err(cpu_dev, "%s: invalid frequency table: %d\n", __func__, + ret); goto out_cooling_unregister; + } + + policy->cpuinfo.transition_latency = transition_latency; + + pd = cpufreq_get_driver_data(); + if (pd && !pd->independent_clocks) + cpumask_setall(policy->cpus); of_node_put(np); @@ -335,6 +346,8 @@ static int dt_cpufreq_probe(struct platform_device *pdev) if (!IS_ERR(cpu_reg)) regulator_put(cpu_reg); + dt_cpufreq_driver.driver_data = dev_get_platdata(&pdev->dev); + ret = cpufreq_register_driver(&dt_cpufreq_driver); if (ret) dev_err(cpu_dev, "failed register driver: %d\n", ret); diff --git a/include/linux/cpufreq-dt.h b/include/linux/cpufreq-dt.h new file mode 100644 index 000000000000..0414009e2c30 --- /dev/null +++ b/include/linux/cpufreq-dt.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Marvell + * Thomas Petazzoni + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __CPUFREQ_DT_H__ +#define __CPUFREQ_DT_H__ + +struct cpufreq_dt_platform_data { + /* + * True when each CPU has its own clock to control its + * frequency, false when all CPUs are controlled by a single + * clock. + */ + bool independent_clocks; +}; + +#endif /* __CPUFREQ_DT_H__ */ -- cgit v1.2.3 From a00de1ab21acbd150db341cb56f1897550d6688c Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Sun, 19 Oct 2014 11:30:29 +0200 Subject: cpufreq: cpufreq-dt: adjust message related to regulators The cpufreq-dt driver tries to get a regulator for each CPU. This regulator is optional, but when not present, a scary message "failed to get cpuX regulator" is displayed. To solve this, we reduce the severity of the message from dev_warn() to dev_dbg() and we reword the message to not be as scary. Signed-off-by: Thomas Petazzoni Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq-dt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index 52facdaf531e..92c162af5045 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -147,8 +147,8 @@ try_again: goto try_again; } - dev_warn(cpu_dev, "failed to get cpu%d regulator: %ld\n", - cpu, PTR_ERR(cpu_reg)); + dev_dbg(cpu_dev, "no regulator for cpu%d: %ld\n", + cpu, PTR_ERR(cpu_reg)); } cpu_clk = clk_get(cpu_dev, NULL); -- cgit v1.2.3 From 74aa51b5ccd3975392e30d11820dc073c5f2cd32 Mon Sep 17 00:00:00 2001 From: "Preeti U. Murthy" Date: Tue, 14 Oct 2014 13:23:00 +0530 Subject: cpuidle: powernv: Populate cpuidle state details by querying the device-tree We hard code the metrics relevant for cpuidle states in the kernel today. Instead pick them up from the device tree so that they remain relevant and updated for the system that the kernel is running on. Signed-off-by: Preeti U. Murthy Signed-off-by: Shreyas B. Prabhu Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/cpuidle-powernv.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index a64be578dab2..7d3a3497dd4c 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -163,7 +163,8 @@ static int powernv_add_idle_states(void) int nr_idle_states = 1; /* Snooze */ int dt_idle_states; const __be32 *idle_state_flags; - u32 len_flags, flags; + const __be32 *idle_state_latency; + u32 len_flags, flags, latency_ns; int i; /* Currently we have snooze statically defined */ @@ -180,18 +181,32 @@ static int powernv_add_idle_states(void) return nr_idle_states; } + idle_state_latency = of_get_property(power_mgt, + "ibm,cpu-idle-state-latencies-ns", NULL); + if (!idle_state_latency) { + pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-latencies-ns\n"); + return nr_idle_states; + } + dt_idle_states = len_flags / sizeof(u32); for (i = 0; i < dt_idle_states; i++) { flags = be32_to_cpu(idle_state_flags[i]); + + /* Cpuidle accepts exit_latency in us and we estimate + * target residency to be 10x exit_latency + */ + latency_ns = be32_to_cpu(idle_state_latency[i]); if (flags & IDLE_USE_INST_NAP) { /* Add NAP state */ strcpy(powernv_states[nr_idle_states].name, "Nap"); strcpy(powernv_states[nr_idle_states].desc, "Nap"); powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID; - powernv_states[nr_idle_states].exit_latency = 10; - powernv_states[nr_idle_states].target_residency = 100; + powernv_states[nr_idle_states].exit_latency = + ((unsigned int)latency_ns) / 1000; + powernv_states[nr_idle_states].target_residency = + ((unsigned int)latency_ns / 100); powernv_states[nr_idle_states].enter = &nap_loop; nr_idle_states++; } @@ -202,8 +217,10 @@ static int powernv_add_idle_states(void) strcpy(powernv_states[nr_idle_states].desc, "FastSleep"); powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TIMER_STOP; - powernv_states[nr_idle_states].exit_latency = 300; - powernv_states[nr_idle_states].target_residency = 1000000; + powernv_states[nr_idle_states].exit_latency = + ((unsigned int)latency_ns) / 1000; + powernv_states[nr_idle_states].target_residency = + ((unsigned int)latency_ns / 100); powernv_states[nr_idle_states].enter = &fastsleep_loop; nr_idle_states++; } -- cgit v1.2.3 From a5466d7bba9af83a82cc7c081b2a7d557cde3204 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Tue, 21 Oct 2014 10:21:54 +0100 Subject: MIPS: cp1emu: Fix ISA restrictions for cop1x_op instructions Commit 08a07904e1828 ("MIPS: math-emu: Remove most ifdefery") removed the #ifdef ISA conditions and switched to runtime detection. However, according to the instruction set manual, the cop1x_op instructions are available in >=MIPS32r2 as well. This fixes a problem on MIPS32r2 with the ntpd package which failed to execute with a SIGILL exit code due to the fact that a madd.d instruction was not being emulated. Signed-off-by: Markos Chandras Fixes: 08a07904e1828 ("MIPS: math-emu: Remove most ifdefery") Cc: # v3.16+ Cc: linux-mips@linux-mips.org Reviewed-by: Paul Burton Reviewed-by: James Hogan Cc: Markos Chandras Patchwork: https://patchwork.linux-mips.org/patch/8173/ Signed-off-by: Ralf Baechle --- arch/mips/math-emu/cp1emu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 7a4727795a70..51a0fde4bec1 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -1023,7 +1023,7 @@ emul: goto emul; case cop1x_op: - if (cpu_has_mips_4_5 || cpu_has_mips64) + if (cpu_has_mips_4_5 || cpu_has_mips64 || cpu_has_mips32r2) /* its one of ours */ goto emul; @@ -1068,7 +1068,7 @@ emul: break; case cop1x_op: - if (!cpu_has_mips_4_5 && !cpu_has_mips64) + if (!cpu_has_mips_4_5 && !cpu_has_mips64 && !cpu_has_mips32r2) return SIGILL; sig = fpux_emu(xcp, ctx, ir, fault_addr); -- cgit v1.2.3 From 507a369e126bf094ffcef77735a56e25a61e9041 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Tue, 21 Oct 2014 13:23:11 +0100 Subject: MIPS: Lasat: Add missing CONFIG_PROC_FS dependency to PICVUE_PROC The picvue_proc.c file creates the /proc interface for the PICVUE LCD display driver. As a result of which, it needs to depend on the PROC_FS symbol to avoid build problems like the following one when CONFIG_PROC_FS is not enabled. arch/mips/lasat/picvue_proc.c:26:14: error: 'pvc_linename' defined but not used [-Werror=unused-variable] static char *pvc_linename[PVC_NLINES] = {"line1", "line2"}; ^ Signed-off-by: Markos Chandras Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8174/ Signed-off-by: Ralf Baechle --- arch/mips/lasat/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/lasat/Kconfig b/arch/mips/lasat/Kconfig index 1d2ee8a9be13..8776d0a34274 100644 --- a/arch/mips/lasat/Kconfig +++ b/arch/mips/lasat/Kconfig @@ -4,7 +4,7 @@ config PICVUE config PICVUE_PROC tristate "PICVUE LCD display driver /proc interface" - depends on PICVUE + depends on PICVUE && PROC_FS config DS1603 bool "DS1603 RTC driver" -- cgit v1.2.3 From ceab3fe69408cb98f437dad3b4b4bb79434370ef Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 21 Oct 2014 17:01:07 +0100 Subject: arm64: Fix compilation error on UP builds In file included from ./arch/arm64/include/asm/irq_work.h:4:0, from include/linux/irq_work.h:46, from include/linux/perf_event.h:49, from include/linux/ftrace_event.h:9, from include/trace/syscall.h:6, from include/linux/syscalls.h:81, from init/main.c:18: ./arch/arm64/include/asm/smp.h:24:3: error: #error " included in non-SMP build" # error " included in non-SMP build" Signed-off-by: Catalin Marinas Fixes: 3631073659d0 ("arm64: Tell irq work about self IPI support") Reported-by: Guenter Roeck Tested-by: Guenter Roeck --- arch/arm64/include/asm/irq_work.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/arm64/include/asm/irq_work.h b/arch/arm64/include/asm/irq_work.h index 8e24ef3f7c82..b4f6b19a8a68 100644 --- a/arch/arm64/include/asm/irq_work.h +++ b/arch/arm64/include/asm/irq_work.h @@ -1,6 +1,8 @@ #ifndef __ASM_IRQ_WORK_H #define __ASM_IRQ_WORK_H +#ifdef CONFIG_SMP + #include static inline bool arch_irq_work_has_interrupt(void) @@ -8,4 +10,13 @@ static inline bool arch_irq_work_has_interrupt(void) return !!__smp_cross_call; } +#else + +static inline bool arch_irq_work_has_interrupt(void) +{ + return false; +} + +#endif + #endif /* __ASM_IRQ_WORK_H */ -- cgit v1.2.3 From dde1c652d7b1e69a3f1959697f039ad6dfaeb5dd Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 21 Oct 2014 15:32:13 +0200 Subject: ALSA: pcm: Fix false lockdep warnings As PCM core handles the multiple linked streams in parallel, lockdep gets confused (partly because of weak annotations) and spews the false-positive warnings. This hasn't been a problem for long time but the latest PCM lock path update seems to have woken up a sleeping dog. Here is an attempt to paper over this issue: pass the lock subclass just calculated from the depth in snd_pcm_action_group(). Also, a (possibly) wrong lock subclass set in snd_pcm_action_lock_mutex() is dropped, too. Reported-and-tested-by: Arthur Marsh Signed-off-by: Takashi Iwai --- sound/core/pcm_native.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 815396d8427f..166d59cdc86b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -781,16 +781,15 @@ static int snd_pcm_action_group(struct action_ops *ops, { struct snd_pcm_substream *s = NULL; struct snd_pcm_substream *s1; - int res = 0; + int res = 0, depth = 1; snd_pcm_group_for_each_entry(s, substream) { if (do_lock && s != substream) { if (s->pcm->nonatomic) - mutex_lock_nested(&s->self_group.mutex, - SINGLE_DEPTH_NESTING); + mutex_lock_nested(&s->self_group.mutex, depth); else - spin_lock_nested(&s->self_group.lock, - SINGLE_DEPTH_NESTING); + spin_lock_nested(&s->self_group.lock, depth); + depth++; } res = ops->pre_action(s, state); if (res < 0) @@ -906,8 +905,7 @@ static int snd_pcm_action_lock_mutex(struct action_ops *ops, down_read(&snd_pcm_link_rwsem); if (snd_pcm_stream_linked(substream)) { mutex_lock(&substream->group->mutex); - mutex_lock_nested(&substream->self_group.mutex, - SINGLE_DEPTH_NESTING); + mutex_lock(&substream->self_group.mutex); res = snd_pcm_action_group(ops, substream, state, 1); mutex_unlock(&substream->self_group.mutex); mutex_unlock(&substream->group->mutex); -- cgit v1.2.3 From 8a2f38ddfeb526c30b3ec209468172a30a38d996 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Wed, 24 Sep 2014 11:00:37 +0300 Subject: ACPI / platform: provide default DMA mask Most devices are configured for 32-bit DMA addresses. Setting the mask to 32-bit here removes the need for the drivers to do it separately. Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_platform.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 2bf9082f7523..8d099e636b15 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "internal.h" @@ -102,6 +103,7 @@ struct platform_device *acpi_create_platform_device(struct acpi_device *adev) pdevinfo.res = resources; pdevinfo.num_res = count; pdevinfo.acpi_node.companion = adev; + pdevinfo.dma_mask = DMA_BIT_MASK(32); pdev = platform_device_register_full(&pdevinfo); if (IS_ERR(pdev)) dev_err(&adev->dev, "platform device creation failed: %ld\n", -- cgit v1.2.3 From 51fae6da640edf9d266c94f36bc806c63c301991 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:12 +0200 Subject: freezer: Do not freeze tasks killed by OOM killer Since f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) OOM killer relies on being able to thaw a frozen task to handle OOM situation but a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) has reorganized the code and stopped clearing freeze flag in __thaw_task. This means that the target task only wakes up and goes into the fridge again because the freezing condition hasn't changed for it. This reintroduces the bug fixed by f660daac474c6f. Fix the issue by checking for TIF_MEMDIE thread flag in freezing_slow_path and exclude the task from freezing completely. If a task was already frozen it would get woken by __thaw_task from OOM killer and get out of freezer after rechecking freezing(). Changes since v1 - put TIF_MEMDIE check into freezing_slowpath rather than in __refrigerator as per Oleg - return __thaw_task into oom_scan_process_thread because oom_kill_process will not wake task in the fridge because it is sleeping uninterruptible [mhocko@suse.cz: rewrote the changelog] Fixes: a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE) Cc: 3.3+ # 3.3+ Signed-off-by: Cong Wang Signed-off-by: Michal Hocko Acked-by: Oleg Nesterov Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/freezer.c b/kernel/freezer.c index aa6a8aadb911..8f9279b9c6d7 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p) if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) return false; + if (test_thread_flag(TIF_MEMDIE)) + return false; + if (pm_nosig_freezing || cgroup_freezing(p)) return true; -- cgit v1.2.3 From c05eb32f472fb9f7f474c20ff6fa5bfe0cbedc05 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 21 Oct 2014 09:27:13 +0200 Subject: freezer: remove obsolete comments in __thaw_task() __thaw_task() no longer clears frozen flag since commit a3201227f803 (freezer: make freezing() test freeze conditions in effect instead of TIF_FREEZE). Reviewed-by: Michal Hocko Signed-off-by: Cong Wang Signed-off-by: Rafael J. Wysocki --- kernel/freezer.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/kernel/freezer.c b/kernel/freezer.c index 8f9279b9c6d7..a8900a3bc27a 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -150,12 +150,6 @@ void __thaw_task(struct task_struct *p) { unsigned long flags; - /* - * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to - * be visible to @p as waking up implies wmb. Waking up inside - * freezer_lock also prevents wakeups from leaking outside - * refrigerator. - */ spin_lock_irqsave(&freezer_lock, flags); if (frozen(p)) wake_up_process(p); -- cgit v1.2.3 From 5695be142e203167e3cb515ef86a88424f3524eb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 20 Oct 2014 18:12:32 +0200 Subject: OOM, PM: OOM killed task shouldn't escape PM suspend PM freezer relies on having all tasks frozen by the time devices are getting frozen so that no task will touch them while they are getting frozen. But OOM killer is allowed to kill an already frozen task in order to handle OOM situtation. In order to protect from late wake ups OOM killer is disabled after all tasks are frozen. This, however, still keeps a window open when a killed task didn't manage to die by the time freeze_processes finishes. Reduce the race window by checking all tasks after OOM killer has been disabled. This is still not race free completely unfortunately because oom_killer_disable cannot stop an already ongoing OOM killer so a task might still wake up from the fridge and get killed without freeze_processes noticing. Full synchronization of OOM and freezer is, however, too heavy weight for this highly unlikely case. Introduce and check oom_kills counter which gets incremented early when the allocator enters __alloc_pages_may_oom path and only check all the tasks if the counter changes during the freezing attempt. The counter is updated so early to reduce the race window since allocator checked oom_killer_disabled which is set by PM-freezing code. A false positive will push the PM-freezer into a slow path but that is not a big deal. Changes since v1 - push the re-check loop out of freeze_processes into check_frozen_processes and invert the condition to make the code more readable as per Rafael Fixes: f660daac474c6f (oom: thaw threads if oom killed thread is frozen before deferring) Cc: 3.2+ # 3.2+ Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- include/linux/oom.h | 3 +++ kernel/power/process.c | 40 +++++++++++++++++++++++++++++++++++++++- mm/oom_kill.c | 17 +++++++++++++++++ mm/page_alloc.c | 8 ++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 647395a1a550..e8d6e1058723 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -50,6 +50,9 @@ static inline bool oom_task_origin(const struct task_struct *p) extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); + +extern int oom_kills_count(void); +extern void note_oom_kill(void); extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, diff --git a/kernel/power/process.c b/kernel/power/process.c index 7b323221b9ee..5cc588c1abab 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,6 +108,28 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +/* + * Returns true if all freezable tasks (except for current) are frozen already + */ +static bool check_frozen_processes(void) +{ + struct task_struct *g, *p; + bool ret = true; + + read_lock(&tasklist_lock); + for_each_process_thread(g, p) { + if (p != current && !freezer_should_skip(p) && + !frozen(p)) { + ret = false; + goto done; + } + } +done: + read_unlock(&tasklist_lock); + + return ret; +} + /** * freeze_processes - Signal user space processes to enter the refrigerator. * The current thread will not be frozen. The same process that calls @@ -118,6 +140,7 @@ static int try_to_freeze_tasks(bool user_only) int freeze_processes(void) { int error; + int oom_kills_saved; error = __usermodehelper_disable(UMH_FREEZING); if (error) @@ -132,12 +155,27 @@ int freeze_processes(void) pm_wakeup_clear(); printk("Freezing user space processes ... "); pm_freezing = true; + oom_kills_saved = oom_kills_count(); error = try_to_freeze_tasks(true); if (!error) { - printk("done."); __usermodehelper_set_disable_depth(UMH_DISABLED); oom_killer_disable(); + + /* + * There might have been an OOM kill while we were + * freezing tasks and the killed task might be still + * on the way out so we have to double check for race. + */ + if (oom_kills_count() != oom_kills_saved && + !check_frozen_processes()) { + __usermodehelper_set_disable_depth(UMH_ENABLED); + printk("OOM in progress."); + error = -EBUSY; + goto done; + } + printk("done."); } +done: printk("\n"); BUG_ON(in_atomic()); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index bbf405a3a18f..5340f6b91312 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -404,6 +404,23 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, dump_tasks(memcg, nodemask); } +/* + * Number of OOM killer invocations (including memcg OOM killer). + * Primarily used by PM freezer to check for potential races with + * OOM killed frozen task. + */ +static atomic_t oom_kills = ATOMIC_INIT(0); + +int oom_kills_count(void) +{ + return atomic_read(&oom_kills); +} + +void note_oom_kill(void) +{ + atomic_inc(&oom_kills); +} + #define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 736d8e1b6381..9cd36b822444 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2251,6 +2251,14 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, return NULL; } + /* + * PM-freezer should be notified that there might be an OOM killer on + * its way to kill and wake somebody up. This is too early and we might + * end up not killing anything but false positives are acceptable. + * See freeze_processes. + */ + note_oom_kill(); + /* * Go through the zonelist yet one more time, keep very high watermark * here, this is only to catch a parallel oom killing, we must fail if -- cgit v1.2.3 From a28e785a9f794ba32e603570ab52a262cf963489 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Oct 2014 09:27:15 +0200 Subject: PM: convert do_each_thread to for_each_process_thread as per 0c740d0afc3b (introduce for_each_thread() to replace the buggy while_each_thread()) get rid of do_each_thread { } while_each_thread() construct and replace it by a more error prone for_each_thread. This patch doesn't introduce any user visible change. Suggested-by: Oleg Nesterov Signed-off-by: Michal Hocko Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 5cc588c1abab..7f0d4343af1b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only) while (true) { todo = 0; read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p == current || !freeze_task(p)) continue; if (!freezer_should_skip(p)) todo++; - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); if (!user_only) { @@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only) if (!wakeup) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) && freezing(p) && !frozen(p)) sched_show_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); } } else { @@ -229,11 +229,11 @@ void thaw_processes(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); @@ -256,10 +256,10 @@ void thaw_kernel_threads(void) thaw_workqueues(); read_lock(&tasklist_lock); - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) __thaw_task(p); - } while_each_thread(g, p); + } read_unlock(&tasklist_lock); schedule(); -- cgit v1.2.3 From c572aaf46f71f63ae5914d4e194a955e0ba1b519 Mon Sep 17 00:00:00 2001 From: Marc-André Lureau Date: Thu, 16 Oct 2014 11:39:44 +0200 Subject: qxl: don't create too large primary surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Limit primary to qemu vgamem size, to avoid reaching qemu guest bug "requested primary larger than framebuffer" on resizing screen too large to fit. Remove unneeded and misleading variables. Related to: https://bugzilla.redhat.com/show_bug.cgi?id=1127552 Signed-off-by: Marc-André Lureau Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/qxl/qxl_display.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/qxl/qxl_display.c b/drivers/gpu/drm/qxl/qxl_display.c index af9e78546688..0d1396266857 100644 --- a/drivers/gpu/drm/qxl/qxl_display.c +++ b/drivers/gpu/drm/qxl/qxl_display.c @@ -572,7 +572,6 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, struct qxl_framebuffer *qfb; struct qxl_bo *bo, *old_bo = NULL; struct qxl_crtc *qcrtc = to_qxl_crtc(crtc); - uint32_t width, height, base_offset; bool recreate_primary = false; int ret; int surf_id; @@ -602,9 +601,10 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, if (qcrtc->index == 0) recreate_primary = true; - width = mode->hdisplay; - height = mode->vdisplay; - base_offset = 0; + if (bo->surf.stride * bo->surf.height > qdev->vram_size) { + DRM_ERROR("Mode doesn't fit in vram size (vgamem)"); + return -EINVAL; + } ret = qxl_bo_reserve(bo, false); if (ret != 0) @@ -618,10 +618,10 @@ static int qxl_crtc_mode_set(struct drm_crtc *crtc, if (recreate_primary) { qxl_io_destroy_primary(qdev); qxl_io_log(qdev, - "recreate primary: %dx%d (was %dx%d,%d,%d)\n", - width, height, bo->surf.width, - bo->surf.height, bo->surf.stride, bo->surf.format); - qxl_io_create_primary(qdev, base_offset, bo); + "recreate primary: %dx%d,%d,%d\n", + bo->surf.width, bo->surf.height, + bo->surf.stride, bo->surf.format); + qxl_io_create_primary(qdev, 0, bo); bo->is_primary = true; } -- cgit v1.2.3 From e2b6b35ee77522c2e15e770aded0b05c25ca0616 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 4 Oct 2012 14:22:23 +0100 Subject: arm64: vexpress: Add CLCD support to the ARMv8 model platform This patch enables CLCD support for the VE platform emulated by the ARMv8 software model (DT bindings are based on Pawel's vexpress patches) together with defconfig entries for SERIO_AMBAKMI and FB_ARMCLCD. Signed-off-by: Catalin Marinas Acked-by: Pawel Moll --- arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi | 35 +++++++++++++++++++++++++++- arch/arm64/configs/defconfig | 2 ++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi b/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi index ac2cb2418025..c46cbb29f3c6 100644 --- a/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi +++ b/arch/arm64/boot/dts/rtsm_ve-motherboard.dtsi @@ -22,7 +22,7 @@ bank-width = <4>; }; - vram@2,00000000 { + v2m_video_ram: vram@2,00000000 { compatible = "arm,vexpress-vram"; reg = <2 0x00000000 0x00800000>; }; @@ -179,9 +179,42 @@ clcd@1f0000 { compatible = "arm,pl111", "arm,primecell"; reg = <0x1f0000 0x1000>; + interrupt-names = "combined"; interrupts = <14>; clocks = <&v2m_oscclk1>, <&v2m_clk24mhz>; clock-names = "clcdclk", "apb_pclk"; + arm,pl11x,framebuffer = <0x18000000 0x00180000>; + memory-region = <&v2m_video_ram>; + max-memory-bandwidth = <130000000>; /* 16bpp @ 63.5MHz */ + + port { + v2m_clcd_pads: endpoint { + remote-endpoint = <&v2m_clcd_panel>; + arm,pl11x,tft-r0g0b0-pads = <0 8 16>; + }; + }; + + panel { + compatible = "panel-dpi"; + + port { + v2m_clcd_panel: endpoint { + remote-endpoint = <&v2m_clcd_pads>; + }; + }; + + panel-timing { + clock-frequency = <63500127>; + hactive = <1024>; + hback-porch = <152>; + hfront-porch = <48>; + hsync-len = <104>; + vactive = <768>; + vback-porch = <23>; + vfront-porch = <3>; + vsync-len = <4>; + }; + }; }; virtio_block@0130000 { diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 9cd37de9aa8d..4ce602c2c6de 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -78,6 +78,7 @@ CONFIG_NET_XGENE=y # CONFIG_WLAN is not set CONFIG_INPUT_EVDEV=y # CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y CONFIG_LEGACY_PTY_COUNT=16 CONFIG_SERIAL_8250=y CONFIG_SERIAL_8250_CONSOLE=y @@ -90,6 +91,7 @@ CONFIG_VIRTIO_CONSOLE=y CONFIG_REGULATOR=y CONFIG_REGULATOR_FIXED_VOLTAGE=y CONFIG_FB=y +CONFIG_FB_ARMCLCD=y CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set -- cgit v1.2.3 From 5a1e73ffe57a86a196c08dffcbd9866776594a38 Mon Sep 17 00:00:00 2001 From: Varka Bhadram Date: Wed, 22 Oct 2014 09:31:15 +0530 Subject: pci: pci-lantiq: remove duplicate check on resource Sanity check on resource happening with devm_ioremap_resource() Signed-off-by: Varka Bhadram Acked-by: John Crispin Cc: linux-mips@linux-mips.org Cc: Varka Bhadram Patchwork: https://patchwork.linux-mips.org/patch/8199/ Signed-off-by: Ralf Baechle --- arch/mips/pci/pci-lantiq.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/mips/pci/pci-lantiq.c b/arch/mips/pci/pci-lantiq.c index 37fe8e7887e2..d3ed15b2b2d1 100644 --- a/arch/mips/pci/pci-lantiq.c +++ b/arch/mips/pci/pci-lantiq.c @@ -215,17 +215,12 @@ static int ltq_pci_probe(struct platform_device *pdev) pci_clear_flags(PCI_PROBE_ONLY); - res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); res_bridge = platform_get_resource(pdev, IORESOURCE_MEM, 1); - if (!res_cfg || !res_bridge) { - dev_err(&pdev->dev, "missing memory resources\n"); - return -EINVAL; - } - ltq_pci_membase = devm_ioremap_resource(&pdev->dev, res_bridge); if (IS_ERR(ltq_pci_membase)) return PTR_ERR(ltq_pci_membase); + res_cfg = platform_get_resource(pdev, IORESOURCE_MEM, 0); ltq_pci_mapped_cfg = devm_ioremap_resource(&pdev->dev, res_cfg); if (IS_ERR(ltq_pci_mapped_cfg)) return PTR_ERR(ltq_pci_mapped_cfg); -- cgit v1.2.3 From aa08ed55442ac6f9810c055e1474be34e785e556 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 21 Sep 2014 15:38:43 +0300 Subject: MIPS: loongson2_cpufreq: Fix CPU clock rate setting mismerge During 3.16 merge window, parts of the commit 8e8acb32960f (MIPS/loongson2_cpufreq: Fix CPU clock rate setting) seem to have been deleted probably due to a mismerge, and as a result cpufreq is broken again on Loongson2 boards in 3.16 and newer kernels. Fix by repeating the fix. Signed-off-by: Aaro Koskinen Cc: stable@vger.kernel.org # 3.16 Cc: Rafael J. Wysocki Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/7835/ Signed-off-by: Ralf Baechle --- arch/mips/loongson/lemote-2f/clock.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/mips/loongson/lemote-2f/clock.c b/arch/mips/loongson/lemote-2f/clock.c index a217061beee3..462e34d46b4a 100644 --- a/arch/mips/loongson/lemote-2f/clock.c +++ b/arch/mips/loongson/lemote-2f/clock.c @@ -91,6 +91,7 @@ EXPORT_SYMBOL(clk_put); int clk_set_rate(struct clk *clk, unsigned long rate) { + unsigned int rate_khz = rate / 1000; struct cpufreq_frequency_table *pos; int ret = 0; int regval; @@ -107,9 +108,9 @@ int clk_set_rate(struct clk *clk, unsigned long rate) propagate_rate(clk); cpufreq_for_each_valid_entry(pos, loongson2_clockmod_table) - if (rate == pos->frequency) + if (rate_khz == pos->frequency) break; - if (rate != pos->frequency) + if (rate_khz != pos->frequency) return -ENOTSUPP; clk->rate = rate; -- cgit v1.2.3 From 9e0f162a36914937a937358fcb45e0609ef2bfc4 Mon Sep 17 00:00:00 2001 From: David Daney Date: Mon, 20 Oct 2014 15:34:23 -0700 Subject: MIPS: tlbex: Properly fix HUGE TLB Refill exception handler In commit 8393c524a25609 (MIPS: tlbex: Fix a missing statement for HUGETLB), the TLB Refill handler was fixed so that non-OCTEON targets would work properly with huge pages. The change was incorrect in that it broke the OCTEON case. The problem is shown here: xxx0: df7a0000 ld k0,0(k1) . . . xxxc0: df610000 ld at,0(k1) xxxc4: 335a0ff0 andi k0,k0,0xff0 xxxc8: e825ffcd bbit1 at,0x5,0x0 xxxcc: 003ad82d daddu k1,at,k0 . . . In the non-octeon case there is a destructive test for the huge PTE bit, and then at 0, $k0 is reloaded (that is what the 8393c524a25609 patch added). In the octeon case, we modify k1 in the branch delay slot, but we never need k0 again, so the new load is not needed, but since k1 is modified, if we do the load, we load from a garbage location and then get a nested TLB Refill, which is seen in userspace as either SIGBUS or SIGSEGV (depending on the garbage). The real fix is to only do this reloading if it is needed, and never where it is harmful. Signed-off-by: David Daney Cc: Huacai Chen Cc: Fuxin Zhang Cc: Zhangjin Wu Cc: stable@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8151/ Signed-off-by: Ralf Baechle --- arch/mips/mm/tlbex.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index a08dd53a1cc5..b5f228e7eae6 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -1062,6 +1062,7 @@ static void build_update_entries(u32 **p, unsigned int tmp, unsigned int ptep) struct mips_huge_tlb_info { int huge_pte; int restore_scratch; + bool need_reload_pte; }; static struct mips_huge_tlb_info @@ -1076,6 +1077,7 @@ build_fast_tlb_refill_handler (u32 **p, struct uasm_label **l, rv.huge_pte = scratch; rv.restore_scratch = 0; + rv.need_reload_pte = false; if (check_for_high_segbits) { UASM_i_MFC0(p, tmp, C0_BADVADDR); @@ -1264,6 +1266,7 @@ static void build_r4000_tlb_refill_handler(void) } else { htlb_info.huge_pte = K0; htlb_info.restore_scratch = 0; + htlb_info.need_reload_pte = true; vmalloc_mode = refill_noscratch; /* * create the plain linear handler @@ -1300,7 +1303,8 @@ static void build_r4000_tlb_refill_handler(void) } #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT uasm_l_tlb_huge_update(&l, p); - UASM_i_LW(&p, K0, 0, K1); + if (htlb_info.need_reload_pte) + UASM_i_LW(&p, htlb_info.huge_pte, 0, K1); build_huge_update_entries(&p, htlb_info.huge_pte, K1); build_huge_tlb_write_entry(&p, &l, &r, K0, tlb_random, htlb_info.restore_scratch); -- cgit v1.2.3 From b61cd31ee6cbe65f67a6ccba2bcbd3c702f52778 Mon Sep 17 00:00:00 2001 From: David Daney Date: Mon, 20 Oct 2014 15:44:24 -0700 Subject: MIPS: Octeon: Remove special case for simulator command line. There is no reason to have the kernel to append commands when running under the simulator, the simulator is perfectly capable of supplying the necessary command line arguments. Furthermore, if the simulator needs something different than what is hard coded in the kernel, it cannot get it if the kernel overrides it. Fix/Simplify the whole thing by removing this bit. Signed-off-by: David Daney Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8152/ Signed-off-by: Ralf Baechle --- arch/mips/cavium-octeon/setup.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/arch/mips/cavium-octeon/setup.c b/arch/mips/cavium-octeon/setup.c index 38f4c32e2816..5ebdb32d9a2b 100644 --- a/arch/mips/cavium-octeon/setup.c +++ b/arch/mips/cavium-octeon/setup.c @@ -806,15 +806,6 @@ void __init prom_init(void) #endif } - if (octeon_is_simulation()) { - /* - * The simulator uses a mtdram device pre filled with - * the filesystem. Also specify the calibration delay - * to avoid calculating it every time. - */ - strcat(arcs_cmdline, " rw root=1f00 slram=root,0x40000000,+1073741824"); - } - mips_hpt_frequency = octeon_get_clock_rate(); octeon_init_cvmcount(); -- cgit v1.2.3 From 71be2114a5474a76edad95343d89b8731457fccd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 22 Oct 2014 22:47:32 +0200 Subject: PM / freezer: Clean up code after recent fixes Clean up the code in process.c after recent changes to get rid of unnecessary labels and goto statements. Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/kernel/power/process.c b/kernel/power/process.c index 7f0d4343af1b..5a6ec8678b9a 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -108,25 +108,27 @@ static int try_to_freeze_tasks(bool user_only) return todo ? -EBUSY : 0; } +static bool __check_frozen_processes(void) +{ + struct task_struct *g, *p; + + for_each_process_thread(g, p) + if (p != current && !freezer_should_skip(p) && !frozen(p)) + return false; + + return true; +} + /* * Returns true if all freezable tasks (except for current) are frozen already */ static bool check_frozen_processes(void) { - struct task_struct *g, *p; - bool ret = true; + bool ret; read_lock(&tasklist_lock); - for_each_process_thread(g, p) { - if (p != current && !freezer_should_skip(p) && - !frozen(p)) { - ret = false; - goto done; - } - } -done: + ret = __check_frozen_processes(); read_unlock(&tasklist_lock); - return ret; } @@ -167,15 +169,14 @@ int freeze_processes(void) * on the way out so we have to double check for race. */ if (oom_kills_count() != oom_kills_saved && - !check_frozen_processes()) { + !check_frozen_processes()) { __usermodehelper_set_disable_depth(UMH_ENABLED); printk("OOM in progress."); error = -EBUSY; - goto done; + } else { + printk("done."); } - printk("done."); } -done: printk("\n"); BUG_ON(in_atomic()); -- cgit v1.2.3 From cd3d6438b4cdbe39fe8daf1a0abeebabcf9521d5 Mon Sep 17 00:00:00 2001 From: Stefan Hengelein Date: Mon, 20 Oct 2014 18:33:39 +0200 Subject: MIPS: MSP71xx: Remove compilation error when CONFIG_MIPS_MT is present When CONFIG_MIPS_MT is defined, code is enabled that tries to call 'set_vi_handler()'. This function is declared in but the header is never included. Therefore, the compilation breaks. arch/mips/pmcs-msp71xx/msp_irq.c:133: error: implicit declaration of function 'set_vi_handler' This error was found with vampyr. Signed-off-by: Stefan Hengelein Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: ryazanov.s.a@gmail.com Patchwork: https://patchwork.linux-mips.org/patch/8122/ Signed-off-by: Ralf Baechle --- arch/mips/pmcs-msp71xx/msp_irq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/pmcs-msp71xx/msp_irq.c b/arch/mips/pmcs-msp71xx/msp_irq.c index f914c753de21..8d53d7a2ed45 100644 --- a/arch/mips/pmcs-msp71xx/msp_irq.c +++ b/arch/mips/pmcs-msp71xx/msp_irq.c @@ -16,6 +16,7 @@ #include #include +#include #include -- cgit v1.2.3 From 44da7623e307bba3eefd7eaf11e605e15a923d72 Mon Sep 17 00:00:00 2001 From: Stefan Hengelein Date: Mon, 20 Oct 2014 15:11:39 +0200 Subject: MIPS: ath79: Fix compilation error when CONFIG_PCI is disabled When CONFIG_PCI is disabled, 'db120_pci_init()' had a different signature than when was enabled. Therefore, compilation failed when CONFIG_PCI was not present. arch/mips/ath79/mach-db120.c:132: error: too many arguments to function 'db120_pci_init' This error was found with vampyr. Signed-off-by: Stefan Hengelein Reviewed-by: Markos Chandras Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Cc: geert@linux-m68k.org Patchwork: https://patchwork.linux-mips.org/patch/8119/ Signed-off-by: Ralf Baechle --- arch/mips/ath79/mach-db120.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ath79/mach-db120.c b/arch/mips/ath79/mach-db120.c index 4d661a1d2dae..9423f5aed287 100644 --- a/arch/mips/ath79/mach-db120.c +++ b/arch/mips/ath79/mach-db120.c @@ -113,7 +113,7 @@ static void __init db120_pci_init(u8 *eeprom) ath79_register_pci(); } #else -static inline void db120_pci_init(void) {} +static inline void db120_pci_init(u8 *eeprom) {} #endif /* CONFIG_PCI */ static void __init db120_setup(void) -- cgit v1.2.3 From 9e8beeb79ded25c5c1986f80fb8a7f6815345d5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:58:48 -0600 Subject: audit: Remove "weak" from audit_classify_compat_syscall() declaration There's only one audit_classify_compat_syscall() definition, so it doesn't need to be weak. Remove the "weak" attribute from the audit_classify_compat_syscall() declaration. Signed-off-by: Bjorn Helgaas Acked-by: Richard Guy Briggs CC: AKASHI Takahiro --- include/linux/audit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index 36dffeccebdb..e58fe7df8b9c 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -90,7 +90,7 @@ extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; -extern int __weak audit_classify_compat_syscall(int abi, unsigned syscall); +extern int audit_classify_compat_syscall(int abi, unsigned syscall); /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ -- cgit v1.2.3 From 754f0da0dc33e8322f7f447aac8e16d581c324ab Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:01:34 -0600 Subject: x86, intel-mid: Remove "weak" from function declarations For the following interfaces: get_penwell_ops() get_cloverview_ops() get_tangier_ops() there is only one implementation, so they do not need to be marked "weak". Remove the "weak" attribute from their declarations. Signed-off-by: Bjorn Helgaas Acked-by: Ingo Molnar CC: David Cohen CC: Kuppuswamy Sathyanarayanan CC: x86@kernel.org --- arch/x86/platform/intel-mid/intel_mid_weak_decls.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h index 46aa25c8ce06..3c1c3866d82b 100644 --- a/arch/x86/platform/intel-mid/intel_mid_weak_decls.h +++ b/arch/x86/platform/intel-mid/intel_mid_weak_decls.h @@ -10,10 +10,9 @@ */ -/* __attribute__((weak)) makes these declarations overridable */ /* For every CPU addition a new get__ops interface needs * to be added. */ -extern void *get_penwell_ops(void) __attribute__((weak)); -extern void *get_cloverview_ops(void) __attribute__((weak)); -extern void *get_tangier_ops(void) __attribute__((weak)); +extern void *get_penwell_ops(void); +extern void *get_cloverview_ops(void); +extern void *get_tangier_ops(void); -- cgit v1.2.3 From 96a2adbc6f501996418da9f7afe39bf0e4d006a9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:09 -0600 Subject: clocksource: Remove "weak" from clocksource_default_clock() declaration kernel/time/jiffies.c provides a default clocksource_default_clock() definition explicitly marked "weak". arch/s390 provides its own definition intended to override the default, but the "weak" attribute on the declaration applied to the s390 definition as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the clocksource_default_clock() declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: f1b82746c1e9 ("clocksource: Cleanup clocksource selection") Signed-off-by: Bjorn Helgaas Acked-by: John Stultz Acked-by: Ingo Molnar CC: Daniel Lezcano CC: Martin Schwidefsky --- include/linux/clocksource.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 653f0e2b6ca9..abcafaa20b86 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -287,7 +287,7 @@ extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); -extern struct clocksource * __init __weak clocksource_default_clock(void); +extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -- cgit v1.2.3 From 5ab03ac5aaa1f032e071f1b3dc433b7839359c03 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 18:59:41 -0600 Subject: vmcore: Remove "weak" from function declarations For the following functions: elfcorehdr_alloc() elfcorehdr_free() elfcorehdr_read() elfcorehdr_read_notes() remap_oldmem_pfn_range() fs/proc/vmcore.c provides default definitions explicitly marked "weak". arch/s390 provides its own definitions intended to override the default ones, but the "weak" attribute on the declarations applied to the s390 definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: be8a8d069e50 ("vmcore: introduce ELF header in new memory feature") Fixes: 9cb218131de1 ("vmcore: introduce remap_oldmem_pfn_range()") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton Acked-by: Vivek Goyal CC: Michael Holzheu --- include/linux/crash_dump.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 72ab536ad3de..3849fce7ecfe 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -14,14 +14,13 @@ extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; -extern int __weak elfcorehdr_alloc(unsigned long long *addr, - unsigned long long *size); -extern void __weak elfcorehdr_free(unsigned long long addr); -extern ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos); -extern ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); -extern int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma, - unsigned long from, unsigned long pfn, - unsigned long size, pgprot_t prot); +extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); +extern void elfcorehdr_free(unsigned long long addr); +extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); +extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); +extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); -- cgit v1.2.3 From 4fbf81ca53805ff89336c70bff58365ef1e1ab70 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 20 Oct 2014 10:17:04 -0600 Subject: ARC: kgdb: generic kgdb_arch_pc() suffices The ARC version of kgdb_arch_pc() is identical to the generic version in kernel/debug/debug_core.c. Drop the ARC version so we use the generic one. Signed-off-by: Vineet Gupta Signed-off-by: Bjorn Helgaas --- arch/arc/kernel/kgdb.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/arc/kernel/kgdb.c b/arch/arc/kernel/kgdb.c index a2ff5c5d1450..ecf6a7869375 100644 --- a/arch/arc/kernel/kgdb.c +++ b/arch/arc/kernel/kgdb.c @@ -158,11 +158,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, return -1; } -unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) -{ - return instruction_pointer(regs); -} - int kgdb_arch_init(void) { single_step_data.armed = 0; -- cgit v1.2.3 From 107bcc6d566cb40184068d888637f9aefe6252dd Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:25 -0600 Subject: kgdb: Remove "weak" from kgdb_arch_pc() declaration kernel/debug/debug_core.c provides a default kgdb_arch_pc() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 688b744d8bc8 ("kgdb: fix signedness mixmatches, add statics, add declaration to header") Tested-by: Vineet Gupta # for ARC build Signed-off-by: Bjorn Helgaas Reviewed-by: Harvey Harrison --- include/linux/kgdb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h index 6b06d378f3df..e465bb15912d 100644 --- a/include/linux/kgdb.h +++ b/include/linux/kgdb.h @@ -283,7 +283,7 @@ struct kgdb_io { extern struct kgdb_arch arch_kgdb_ops; -extern unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs); +extern unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs); #ifdef CONFIG_SERIAL_KGDB_NMI extern int kgdb_register_nmi_console(void); -- cgit v1.2.3 From e0a8400c6923a163265d52798cdd4c33f3f8ab5a Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:00:47 -0600 Subject: memory-hotplug: Remove "weak" from memory_block_size_bytes() declaration drivers/base/memory.c provides a default memory_block_size_bytes() definition explicitly marked "weak". Several architectures provide their own definitions intended to override the default, but the "weak" attribute on the declaration applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declaration so we always prefer a non-weak definition over the weak one, independent of link order. Fixes: 41f107266b19 ("drivers: base: Add prototype declaration to the header file") Signed-off-by: Bjorn Helgaas Acked-by: Andrew Morton CC: Rashika Kheria CC: Nathan Fontenot CC: Anton Blanchard CC: Heiko Carstens CC: Yinghai Lu --- include/linux/memory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memory.h b/include/linux/memory.h index bb7384e3c3d8..8b8d8d12348e 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -35,7 +35,7 @@ struct memory_block { }; int arch_get_memory_phys_device(unsigned long start_pfn); -unsigned long __weak memory_block_size_bytes(void); +unsigned long memory_block_size_bytes(void); /* These states are exposed to userspace as text strings in sysfs */ #define MEM_ONLINE (1<<0) /* exposed to userspace */ -- cgit v1.2.3 From 271a9c35158910496f6fc3a635c2ed85df6be3d9 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 13 Oct 2014 19:01:03 -0600 Subject: uprobes: Remove "weak" from function declarations For the following interfaces: set_swbp() set_orig_insn() is_swbp_insn() is_trap_insn() uprobe_get_swbp_addr() arch_uprobe_ignore() arch_uprobe_copy_ixol() kernel/events/uprobes.c provides default definitions explicitly marked "weak". Some architectures provide their own definitions intended to override the defaults, but the "weak" attribute on the declarations applied to the arch definitions as well, so the linker chose one based on link order (see 10629d711ed7 ("PCI: Remove __weak annotation from pcibios_get_phb_of_node decl")). Remove the "weak" attribute from the declarations so we always prefer a non-weak definition over the weak one, independent of link order. Signed-off-by: Bjorn Helgaas Acked-by: Ingo Molnar Acked-by: Srikar Dronamraju CC: Victor Kamensky CC: Oleg Nesterov CC: David A. Long CC: Ananth N Mavinakayanahalli --- include/linux/uprobes.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 4f844c6b03ee..60beb5dc7977 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -98,11 +98,11 @@ struct uprobes_state { struct xol_area *xol_area; }; -extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); -extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); -extern bool __weak is_trap_insn(uprobe_opcode_t *insn); -extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); +extern int set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern int set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); +extern bool is_swbp_insn(uprobe_opcode_t *insn); +extern bool is_trap_insn(uprobe_opcode_t *insn); +extern unsigned long uprobe_get_swbp_addr(struct pt_regs *regs); extern unsigned long uprobe_get_trap_addr(struct pt_regs *regs); extern int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, uprobe_opcode_t); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); @@ -128,8 +128,8 @@ extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs); -extern bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); -extern void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, +extern bool arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs); +extern void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, void *src, unsigned long len); #else /* !CONFIG_UPROBES */ struct uprobes_state { -- cgit v1.2.3 From c0c3e735fa7bae29c6623511127fd021b2d6d849 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Fri, 11 Apr 2014 08:56:23 +0200 Subject: drm/cirrus: bind also to qemu-xen-traditional qemu as used by xend/xm toolstack uses a different subvendor id. Bind the drm driver also to this emulated card. Signed-off-by: Olaf Hering cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/cirrus/cirrus_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/cirrus/cirrus_drv.c b/drivers/gpu/drm/cirrus/cirrus_drv.c index e705335101a5..c2a1cba1e984 100644 --- a/drivers/gpu/drm/cirrus/cirrus_drv.c +++ b/drivers/gpu/drm/cirrus/cirrus_drv.c @@ -32,6 +32,8 @@ static struct drm_driver driver; static const struct pci_device_id pciidlist[] = { { PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_5446, 0x1af4, 0x1100, 0, 0, 0 }, + { PCI_VENDOR_ID_CIRRUS, PCI_DEVICE_ID_CIRRUS_5446, PCI_VENDOR_ID_XEN, + 0x0001, 0, 0, 0 }, {0,} }; -- cgit v1.2.3 From cdb685ad44996e9a113a10002cb42d40ff29db99 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Tue, 14 Oct 2014 00:42:08 +0300 Subject: MIPS: ptrace.h: Add a missing include Commit a79ebea62010 (MIPS: ptrace: Fix user pt_regs definition, use in ptrace_{get, set}regs()) converted struct pt_regs to use __u64. Some userspace applications (e.g. GDB) include this file directly, and fail to see this type. Fix by including . The patch fixes the following build failure with GDB 7.8 when using GLIBC headers created against Linux 3.17: In file included from /home/aaro/los/work/shared/gdb-7.8/gdb/mips-linux-nat.c:37:0: /home/aaro/los/work/mips/rootfs/mips-linux-gnu/usr/include/asm/ptrace.h:32:2: error: unknown type name '__u64' __u64 regs[32]; ^ /home/aaro/los/work/mips/rootfs/mips-linux-gnu/usr/include/asm/ptrace.h:35:2: error: unknown type name '__u64' __u64 lo; ^ /home/aaro/los/work/mips/rootfs/mips-linux-gnu/usr/include/asm/ptrace.h:36:2: error: unknown type name '__u64' __u64 hi; ^ Fixes: a79ebea62010 ("MIPS: ptrace: Fix user pt_regs definition, use in ptrace_{get, set}regs()") Cc: stable@vger.kernel.org # 3.17 Signed-off-by: Aaro Koskinen Cc: Alex Smith Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8067/ Signed-off-by: Ralf Baechle --- arch/mips/include/uapi/asm/ptrace.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/mips/include/uapi/asm/ptrace.h b/arch/mips/include/uapi/asm/ptrace.h index bbcfb8ba8106..91a3d197ede3 100644 --- a/arch/mips/include/uapi/asm/ptrace.h +++ b/arch/mips/include/uapi/asm/ptrace.h @@ -9,6 +9,8 @@ #ifndef _UAPI_ASM_PTRACE_H #define _UAPI_ASM_PTRACE_H +#include + /* 0 - 31 are integer registers, 32 - 63 are fp registers. */ #define FPR_BASE 32 #define PC 64 -- cgit v1.2.3 From 3d1f9dda48d429d9b252e5df23c1ee23617461af Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Fri, 17 Oct 2014 10:49:38 +0100 Subject: MIPS: Sibyte: Include the swarm subdir to the sb1250 LittleSur builds Fixes the following randconfig build problem: arch/mips/built-in.o: In function `show_cpuinfo': proc.c:(.text+0xde84): undefined reference to `get_system_type' arch/mips/built-in.o: In function `sb1250_setup': (.init.text+0x428): undefined reference to `get_system_type' arch/mips/built-in.o: In function `setup_arch': (.init.text+0x178c): undefined reference to `plat_mem_setup' Makefile:930: recipe for target 'vmlinux' failed Signed-off-by: Markos Chandras Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8106/ Signed-off-by: Ralf Baechle --- arch/mips/sibyte/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/sibyte/Makefile b/arch/mips/sibyte/Makefile index c8ed2c807e69..455c40d6d625 100644 --- a/arch/mips/sibyte/Makefile +++ b/arch/mips/sibyte/Makefile @@ -25,3 +25,4 @@ obj-$(CONFIG_SIBYTE_RHONE) += swarm/ obj-$(CONFIG_SIBYTE_SENTOSA) += swarm/ obj-$(CONFIG_SIBYTE_SWARM) += swarm/ obj-$(CONFIG_SIBYTE_BIGSUR) += swarm/ +obj-$(CONFIG_SIBYTE_LITTLESUR) += swarm/ -- cgit v1.2.3 From fd8b79511349efd1f0decea920f61b93acb34a75 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Tue, 7 Oct 2014 17:00:07 -0400 Subject: xen/balloon: Don't continue ballooning when BP_ECANCELED is encountered Commit 3dcf63677d4e ("xen/balloon: cancel ballooning if adding new memory failed") makes reserve_additional_memory() return BP_ECANCELED when an error is encountered. This error, however, is ignored by the caller (balloon_process()) since it is overwritten by subsequent call to update_schedule(). This results in continuous attempts to add more memory, all of which are likely to fail again. We should stop trying to schedule next iteration of ballooning when the current one has failed. Signed-off-by: Boris Ostrovsky Reviewed-by: Daniel Kiper Signed-off-by: David Vrabel --- drivers/xen/balloon.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 1e0a317d3dcd..3860d02729dc 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -167,6 +167,9 @@ static struct page *balloon_next_page(struct page *page) static enum bp_state update_schedule(enum bp_state state) { + if (state == BP_ECANCELED) + return BP_ECANCELED; + if (state == BP_DONE) { balloon_stats.schedule_delay = 1; balloon_stats.retry_count = 1; -- cgit v1.2.3 From 239af7c7132a617f9dcd05da1dc92b96bc6d0645 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2014 11:00:18 +0200 Subject: x86/xen: avoid writing to freed memory after race in p2m handling In case a race was detected during allocation of a new p2m tree element in alloc_p2m() the new allocated mid_mfn page is freed without updating the pointer to the found value in the tree. This will result in overwriting the just freed page with the mfn of the p2m leaf. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 9f5983b01ed9..4534320e66e4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -566,6 +566,7 @@ static bool alloc_p2m(unsigned long pfn) /* Separately check the mid mfn level */ unsigned long missing_mfn; unsigned long mid_mfn_mfn; + unsigned long old_mfn; mid_mfn = alloc_p2m_page(); if (!mid_mfn) @@ -575,10 +576,13 @@ static bool alloc_p2m(unsigned long pfn) missing_mfn = virt_to_mfn(p2m_mid_missing_mfn); mid_mfn_mfn = virt_to_mfn(mid_mfn); - if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn) + old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn); + if (old_mfn != missing_mfn) { free_p2m_page(mid_mfn); - else + mid_mfn = mfn_to_virt(old_mfn); + } else { p2m_top_mfn_p[topidx] = mid_mfn; + } } if (p2m_top[topidx][mididx] == p2m_identity || -- cgit v1.2.3 From 2c185687ab016954557aac80074f5d7f7f5d275c Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 14 Oct 2014 13:33:46 +0200 Subject: x86/xen: delay construction of mfn_list_list The 3 level p2m tree for the Xen tools is constructed very early at boot by calling xen_build_mfn_list_list(). Memory needed for this tree is allocated via extend_brk(). As this tree (other than the kernel internal p2m tree) is only needed for domain save/restore, live migration and crash dump analysis it doesn't matter whether it is constructed very early or just some milliseconds later when memory allocation is possible by other means. This patch moves the call of xen_build_mfn_list_list() just after calling xen_pagetable_p2m_copy() simplifying this function, too, as it doesn't have to bother with two parallel trees now. The same applies for some other internal functions. While simplifying code, make early_can_reuse_p2m_middle() static and drop the unused second parameter. p2m_mid_identity_mfn can be removed as well, it isn't used either. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/enlighten.c | 3 --- arch/x86/xen/mmu.c | 5 +++- arch/x86/xen/p2m.c | 65 +++++++++++------------------------------------- 3 files changed, 18 insertions(+), 55 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1a3f0445432a..fac5e4f9607c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1636,9 +1636,6 @@ asmlinkage __visible void __init xen_start_kernel(void) xen_raw_console_write("mapping kernel into physical memory\n"); xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, xen_start_info->nr_pages); - /* Allocate and initialize top and mid mfn levels for p2m structure */ - xen_build_mfn_list_list(); - /* keep using Xen gdt for now; no urgent need to change it */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f62af7647ec9..a8a1a3d08d4d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1217,10 +1217,13 @@ static void __init xen_pagetable_p2m_copy(void) static void __init xen_pagetable_init(void) { paging_init(); - xen_setup_shared_info(); #ifdef CONFIG_X86_64 xen_pagetable_p2m_copy(); #endif + /* Allocate and initialize top and mid mfn levels for p2m structure */ + xen_build_mfn_list_list(); + + xen_setup_shared_info(); xen_post_allocator_init(); } static void xen_write_cr2(unsigned long cr2) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 4534320e66e4..d1b3da2960ab 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -163,6 +163,7 @@ #include #include #include +#include #include #include @@ -181,21 +182,20 @@ static void __init m2p_override_init(void); unsigned long xen_max_p2m_pfn __read_mostly; +static unsigned long *p2m_mid_missing_mfn; +static unsigned long *p2m_top_mfn; +static unsigned long **p2m_top_mfn_p; + /* Placeholders for holes in the address space */ static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE); static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE); -static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE); RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); -RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE))); /* For each I/O range remapped we may lose up to two leaf pages for the boundary * violations and three mid pages to cover up to 3GB. With @@ -272,11 +272,11 @@ static void p2m_init(unsigned long *p2m) * Build the parallel p2m_top_mfn and p2m_mid_mfn structures * * This is called both at boot time, and after resuming from suspend: - * - At boot time we're called very early, and must use extend_brk() + * - At boot time we're called rather early, and must use alloc_bootmem*() * to allocate memory. * * - After resume we're called from within stop_machine, but the mfn - * tree should alreay be completely allocated. + * tree should already be completely allocated. */ void __ref xen_build_mfn_list_list(void) { @@ -287,20 +287,17 @@ void __ref xen_build_mfn_list_list(void) /* Pre-initialize p2m_top_mfn to be completely missing */ if (p2m_top_mfn == NULL) { - p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_identity_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); - p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_p_init(p2m_top_mfn_p); - p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE); + p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_top_mfn_init(p2m_top_mfn); } else { /* Reinitialise, mfn's all change after migration */ p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing); - p2m_mid_mfn_init(p2m_mid_identity_mfn, p2m_identity); } for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) { @@ -328,10 +325,9 @@ void __ref xen_build_mfn_list_list(void) /* * XXX boot-time only! We should never find * missing parts of the mfn tree after - * runtime. extend_brk() will BUG if we call - * it too late. + * runtime. */ - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); + mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); p2m_mid_mfn_init(mid_mfn_p, p2m_missing); p2m_top_mfn_p[topidx] = mid_mfn_p; @@ -415,7 +411,6 @@ void __init xen_build_dynamic_phys_to_machine(void) m2p_override_init(); } #ifdef CONFIG_X86_64 -#include unsigned long __init xen_revector_p2m_tree(void) { unsigned long va_start; @@ -477,7 +472,6 @@ unsigned long __init xen_revector_p2m_tree(void) copy_page(new, mid_p); p2m_top[topidx][mididx] = &mfn_list[pfn_free]; - p2m_top_mfn_p[topidx][mididx] = virt_to_mfn(&mfn_list[pfn_free]); pfn_free += P2M_PER_PAGE; @@ -610,7 +604,6 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) { unsigned topidx, mididx, idx; unsigned long *p2m; - unsigned long *mid_mfn_p; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); @@ -637,43 +630,21 @@ static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary) p2m_top[topidx][mididx] = p2m; - /* For save/restore we need to MFN of the P2M saved */ - - mid_mfn_p = p2m_top_mfn_p[topidx]; - WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing), - "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n", - topidx, mididx); - mid_mfn_p[mididx] = virt_to_mfn(p2m); - return true; } static bool __init early_alloc_p2m_middle(unsigned long pfn) { unsigned topidx = p2m_top_index(pfn); - unsigned long *mid_mfn_p; unsigned long **mid; mid = p2m_top[topidx]; - mid_mfn_p = p2m_top_mfn_p[topidx]; if (mid == p2m_mid_missing) { mid = extend_brk(PAGE_SIZE, PAGE_SIZE); p2m_mid_init(mid, p2m_missing); p2m_top[topidx] = mid; - - BUG_ON(mid_mfn_p != p2m_mid_missing_mfn); - } - /* And the save/restore P2M tables.. */ - if (mid_mfn_p == p2m_mid_missing_mfn) { - mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE); - p2m_mid_mfn_init(mid_mfn_p, p2m_missing); - - p2m_top_mfn_p[topidx] = mid_mfn_p; - p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p); - /* Note: we don't set mid_mfn_p[midix] here, - * look in early_alloc_p2m() */ } return true; } @@ -684,14 +655,13 @@ static bool __init early_alloc_p2m_middle(unsigned long pfn) * replace the P2M leaf with a p2m_missing or p2m_identity. * Stick the old page in the new P2M tree location. */ -bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_mfn) +static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn) { unsigned topidx; unsigned mididx; unsigned ident_pfns; unsigned inv_pfns; unsigned long *p2m; - unsigned long *mid_mfn_p; unsigned idx; unsigned long pfn; @@ -737,11 +707,6 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_ found: /* Found one, replace old with p2m_identity or p2m_missing */ p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing); - /* And the other for save/restore.. */ - mid_mfn_p = p2m_top_mfn_p[topidx]; - /* NOTE: Even if it is a p2m_identity it should still be point to - * a page filled with INVALID_P2M_ENTRY entries. */ - mid_mfn_p[mididx] = virt_to_mfn(p2m_missing); /* Reset where we want to stick the old page in. */ topidx = p2m_top_index(set_pfn); @@ -756,8 +721,6 @@ found: p2m_init(p2m); p2m_top[topidx][mididx] = p2m; - mid_mfn_p = p2m_top_mfn_p[topidx]; - mid_mfn_p[mididx] = virt_to_mfn(p2m); return true; } @@ -767,7 +730,7 @@ bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn) if (!early_alloc_p2m_middle(pfn)) return false; - if (early_can_reuse_p2m_middle(pfn, mfn)) + if (early_can_reuse_p2m_middle(pfn)) return __set_phys_to_machine(pfn, mfn); if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/)) -- cgit v1.2.3 From 3a0e94f8ead4a58b9719db0f78e13d02d059604f Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Fri, 17 Oct 2014 13:16:06 +0200 Subject: x86/xen: avoid race in p2m handling When a new p2m leaf is allocated this leaf is linked into the p2m tree via cmpxchg. Unfortunately the compare value for checking the success of the update is read after checking for the need of a new leaf. It is possible that a new leaf has been linked into the tree concurrently in between. This could lead to a leaked memory page and to the loss of some p2m entries. Avoid the race by using the read compare value for checking the need of a new p2m leaf and use ACCESS_ONCE() to get it. There are other places which seem to need ACCESS_ONCE() to ensure proper operation. Change them accordingly. Signed-off-by: Juergen Gross Signed-off-by: David Vrabel --- arch/x86/xen/p2m.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d1b3da2960ab..b456b048eca9 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -532,12 +532,13 @@ static bool alloc_p2m(unsigned long pfn) unsigned topidx, mididx; unsigned long ***top_p, **mid; unsigned long *top_mfn_p, *mid_mfn; + unsigned long *p2m_orig; topidx = p2m_top_index(pfn); mididx = p2m_mid_index(pfn); top_p = &p2m_top[topidx]; - mid = *top_p; + mid = ACCESS_ONCE(*top_p); if (mid == p2m_mid_missing) { /* Mid level is missing, allocate a new one */ @@ -552,7 +553,7 @@ static bool alloc_p2m(unsigned long pfn) } top_mfn_p = &p2m_top_mfn[topidx]; - mid_mfn = p2m_top_mfn_p[topidx]; + mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]); BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p); @@ -579,11 +580,10 @@ static bool alloc_p2m(unsigned long pfn) } } - if (p2m_top[topidx][mididx] == p2m_identity || - p2m_top[topidx][mididx] == p2m_missing) { + p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]); + if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) { /* p2m leaf page is missing */ unsigned long *p2m; - unsigned long *p2m_orig = p2m_top[topidx][mididx]; p2m = alloc_p2m_page(); if (!p2m) -- cgit v1.2.3 From 3251f20b897f955ab7f153181b4ebfbb2317cbb9 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 16 Oct 2014 17:02:15 -0400 Subject: x86/xen: Fix incorrect per_cpu accessor in xen_clocksource_read() Commit 89cbc76768c2 ("x86: Replace __get_cpu_var uses") replaced __get_cpu_var() with this_cpu_ptr() in xen_clocksource_read() in such a way that instead of accessing a structure pointed to by a per-cpu pointer we are trying to get to a per-cpu structure. __this_cpu_read() of the pointer is the more appropriate accessor. Signed-off-by: Boris Ostrovsky Signed-off-by: David Vrabel --- arch/x86/xen/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index a1d430b112b3..f473d268d387 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -158,7 +158,7 @@ cycle_t xen_clocksource_read(void) cycle_t ret; preempt_disable_notrace(); - src = this_cpu_ptr(&xen_vcpu->time); + src = &__this_cpu_read(xen_vcpu)->time; ret = pvclock_clocksource_read(src); preempt_enable_notrace(); return ret; -- cgit v1.2.3 From 1ea644c8f93f3435760d8b88c25d56475d8b7778 Mon Sep 17 00:00:00 2001 From: Martin Kelly Date: Thu, 16 Oct 2014 20:48:11 -0700 Subject: x86/xen: panic on bad Xen-provided memory map Panic if Xen provides a memory map with 0 entries. Although this is unlikely, it is better to catch the error at the point of seeing the map than later on as a symptom of some other crash. Signed-off-by: Martin Kelly Signed-off-by: David Vrabel --- arch/x86/xen/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index af7216128d93..29834b3fd87f 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -595,6 +595,7 @@ char * __init xen_memory_setup(void) rc = 0; } BUG_ON(rc); + BUG_ON(memmap.nr_entries == 0); /* * Xen won't allow a 1:1 mapping to be created to UNUSABLE -- cgit v1.2.3 From 486edb24952c930966dad125f6727017353e9361 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Mon, 4 Aug 2014 18:17:23 -0400 Subject: xen/pci: Allocate memory for physdev_pci_device_add's optarr physdev_pci_device_add's optarr[] is a zero-sized array and therefore reference to add.optarr[0] is accessing memory that does not belong to the 'add' variable. Signed-off-by: Boris Ostrovsky Reviewed-by: Jan Beulich Signed-off-by: David Vrabel --- drivers/xen/pci.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index dd9c249ea311..95ee4302ffb8 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -41,24 +41,29 @@ static int xen_add_device(struct device *dev) #endif if (pci_seg_supported) { - struct physdev_pci_device_add add = { - .seg = pci_domain_nr(pci_dev->bus), - .bus = pci_dev->bus->number, - .devfn = pci_dev->devfn + struct { + struct physdev_pci_device_add add; + uint32_t pxm; + } add_ext = { + .add.seg = pci_domain_nr(pci_dev->bus), + .add.bus = pci_dev->bus->number, + .add.devfn = pci_dev->devfn }; + struct physdev_pci_device_add *add = &add_ext.add; + #ifdef CONFIG_ACPI acpi_handle handle; #endif #ifdef CONFIG_PCI_IOV if (pci_dev->is_virtfn) { - add.flags = XEN_PCI_DEV_VIRTFN; - add.physfn.bus = physfn->bus->number; - add.physfn.devfn = physfn->devfn; + add->flags = XEN_PCI_DEV_VIRTFN; + add->physfn.bus = physfn->bus->number; + add->physfn.devfn = physfn->devfn; } else #endif if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) - add.flags = XEN_PCI_DEV_EXTFN; + add->flags = XEN_PCI_DEV_EXTFN; #ifdef CONFIG_ACPI handle = ACPI_HANDLE(&pci_dev->dev); @@ -77,8 +82,8 @@ static int xen_add_device(struct device *dev) status = acpi_evaluate_integer(handle, "_PXM", NULL, &pxm); if (ACPI_SUCCESS(status)) { - add.optarr[0] = pxm; - add.flags |= XEN_PCI_DEV_PXM; + add->optarr[0] = pxm; + add->flags |= XEN_PCI_DEV_PXM; break; } status = acpi_get_parent(handle, &handle); @@ -86,7 +91,7 @@ static int xen_add_device(struct device *dev) } #endif /* CONFIG_ACPI */ - r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, add); if (r != -ENOSYS) return r; pci_seg_supported = false; -- cgit v1.2.3 From 9d26024c964718c6307a96f387b24fc06dac4d91 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Tue, 16 Sep 2014 13:57:08 +0100 Subject: MIPS: idle: Remove leftover __pastwait symbol and its references The __pastwait symbol was only used by the address_is_in_r4k_wait_irqoff function but this is no longer used since the SMTC removal in commit b633648c5ad3 ('MIPS: MT: Remove SMTC support'). That symbol also led to build failures under certain random configuration due to the way the compiler compiled the r4k_wait_irqoff function. If that function was called multiple times, the __pastwait symbol was redefined breaking the build like this: CHK include/generated/compile.h CC arch/mips/kernel/idle.o {standard input}: Assembler messages: {standard input}:527: Error: symbol `__pastwait' is already defined Link: http://www.linux-mips.org/cgi-bin/mesg.cgi?a=linux-mips&i=1244879922.24479.30.camel%40falcon Signed-off-by: Markos Chandras Cc: linux-mips@linux-mips.org Cc: Markos Chandras Patchwork: https://patchwork.linux-mips.org/patch/7791/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/idle.h | 7 ------- arch/mips/kernel/idle.c | 3 --- 2 files changed, 10 deletions(-) diff --git a/arch/mips/include/asm/idle.h b/arch/mips/include/asm/idle.h index d9f932de80e9..1c967abd545c 100644 --- a/arch/mips/include/asm/idle.h +++ b/arch/mips/include/asm/idle.h @@ -8,19 +8,12 @@ extern void (*cpu_wait)(void); extern void r4k_wait(void); extern asmlinkage void __r4k_wait(void); extern void r4k_wait_irqoff(void); -extern void __pastwait(void); static inline int using_rollback_handler(void) { return cpu_wait == r4k_wait; } -static inline int address_is_in_r4k_wait_irqoff(unsigned long addr) -{ - return addr >= (unsigned long)r4k_wait_irqoff && - addr < (unsigned long)__pastwait; -} - extern int mips_cpuidle_wait_enter(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c index 09ce45980758..0b9082b6b683 100644 --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -68,9 +68,6 @@ void r4k_wait_irqoff(void) " wait \n" " .set pop \n"); local_irq_enable(); - __asm__( - " .globl __pastwait \n" - "__pastwait: \n"); } /* -- cgit v1.2.3 From 39a595935576cd56e3974830f6114e1a70bac65d Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 18 Sep 2014 16:09:49 +0100 Subject: MIPS: Kconfig: Add missing MIPS_CPS dependencies to PM and cpuidle The MIPS_CPS_PM and MIPS_CPS_CPUIDLE implementation should depend on the MIPS_CPS symbol to avoid the following build problem arch/mips/kernel/pm-cps.c: In function 'cps_pm_enter_state': arch/mips/kernel/pm-cps.c:164:26: error: 'cpu_coherent_mask' undeclared (first use in this function) cpumask_clear_cpu(cpu, &cpu_coherent_mask); ^ Signed-off-by: Markos Chandras Cc: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: http://patchwork.linux-mips.org/patch/7798/ Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 1 + drivers/cpuidle/Kconfig.mips | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ad6badb6be71..f43aa536c517 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2066,6 +2066,7 @@ config MIPS_CPS support is unavailable. config MIPS_CPS_PM + depends on MIPS_CPS select MIPS_CPC bool diff --git a/drivers/cpuidle/Kconfig.mips b/drivers/cpuidle/Kconfig.mips index 0e70ee28a5ca..4102be01d06a 100644 --- a/drivers/cpuidle/Kconfig.mips +++ b/drivers/cpuidle/Kconfig.mips @@ -3,7 +3,7 @@ # config MIPS_CPS_CPUIDLE bool "CPU Idle driver for MIPS CPS platforms" - depends on CPU_IDLE + depends on CPU_IDLE && MIPS_CPS depends on SYS_SUPPORTS_MIPS_CPS select ARCH_NEEDS_CPU_IDLE_COUPLED if MIPS_MT select GENERIC_CLOCKEVENTS_BROADCAST if SMP -- cgit v1.2.3 From b89f30683970fd2011eeae83b5161764aab3e21b Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Thu, 25 Sep 2014 10:26:15 +0100 Subject: MIPS: Prevent compiler warning from cop2_{save,restore} The no-op cases of cop2_save & cop2_restore lead to the following warnings being emitted during build with recent versions of gcc (tested using gcc 4.8.3 from the Mentor Sourcery CodeBench 2014.05 toolchain): In file included from ./arch/mips/include/asm/switch_to.h:18:0, from kernel/sched/core.c:78: kernel/sched/core.c: In function 'finish_task_switch': include/asm-generic/current.h:6:45: warning: value computed is not used [-Wunused-value] #define get_current() (current_thread_info()->task) ^ ./arch/mips/include/asm/cop2.h:48:32: note: in definition of macro 'cop2_restore' #define cop2_restore(r) do { (r); } while (0) ^ include/asm-generic/current.h:7:17: note: in expansion of macro 'get_current' #define current get_current() ^ ./arch/mips/include/asm/switch_to.h:114:16: note: in expansion of macro 'current' cop2_restore(current); \ ^ kernel/sched/core.c:2225:2: note: in expansion of macro 'finish_arch_switch' finish_arch_switch(prev); ^ Avoid the warning by "using" the value by casting to void. Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/7880/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/cop2.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/include/asm/cop2.h b/arch/mips/include/asm/cop2.h index 51f80bd36fcc..63b3468ede4c 100644 --- a/arch/mips/include/asm/cop2.h +++ b/arch/mips/include/asm/cop2.h @@ -37,15 +37,15 @@ extern void nlm_cop2_restore(struct nlm_cop2_state *); #define cop2_present 1 #define cop2_lazy_restore 1 -#define cop2_save(r) do { (r); } while (0) -#define cop2_restore(r) do { (r); } while (0) +#define cop2_save(r) do { (void)(r); } while (0) +#define cop2_restore(r) do { (void)(r); } while (0) #else #define cop2_present 0 #define cop2_lazy_restore 0 -#define cop2_save(r) do { (r); } while (0) -#define cop2_restore(r) do { (r); } while (0) +#define cop2_save(r) do { (void)(r); } while (0) +#define cop2_restore(r) do { (void)(r); } while (0) #endif enum cu2_ops { -- cgit v1.2.3 From 5e9e3a5f69cab3f14f1f91c33ab391d403617575 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 9 Oct 2014 10:34:19 +0100 Subject: MIPS: Malta: Do not build the malta-amon.c file if CMP is not enabled The malta-amon.c file provides functions to access the YAMON Monitoring interface to bring up secondary VPEs in case of SMP/CMP. As a result of which, there is no need to build it if CMP is not used. Signed-off-by: Markos Chandras Reviewed-by: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: http://patchwork.linux-mips.org/patch/7993/ Signed-off-by: Ralf Baechle --- arch/mips/mti-malta/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/mti-malta/Makefile b/arch/mips/mti-malta/Makefile index b9510ea8db56..6510ace272d4 100644 --- a/arch/mips/mti-malta/Makefile +++ b/arch/mips/mti-malta/Makefile @@ -5,8 +5,9 @@ # Copyright (C) 2008 Wind River Systems, Inc. # written by Ralf Baechle # -obj-y := malta-amon.o malta-display.o malta-init.o \ +obj-y := malta-display.o malta-init.o \ malta-int.o malta-memory.o malta-platform.o \ malta-reset.o malta-setup.o malta-time.o +obj-$(CONFIG_MIPS_CMP) += malta-amon.o obj-$(CONFIG_MIPS_MALTA_PM) += malta-pm.o -- cgit v1.2.3 From 67598a1d3140a66f57aa6bcb8d22c4c2b7e910f5 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Thu, 23 Oct 2014 20:20:00 +0800 Subject: ACPI: invoke acpi_device_wakeup() with correct parameters Fix a bug that invokes acpi_device_wakeup() with wrong parameters. Fixes: f35cec255557 (ACPI / PM: Always enable wakeup GPEs when enabling device wakeup) Signed-off-by: Zhang Rui Cc: 3.17+ # 3.17+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/device_pm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 67075f800e34..5e9cbd664286 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -710,7 +710,7 @@ int acpi_pm_device_run_wake(struct device *phys_dev, bool enable) return -ENODEV; } - return acpi_device_wakeup(adev, enable, ACPI_STATE_S0); + return acpi_device_wakeup(adev, ACPI_STATE_S0, enable); } EXPORT_SYMBOL(acpi_pm_device_run_wake); #endif /* CONFIG_PM_RUNTIME */ -- cgit v1.2.3 From 5dfd7f9f88ba8539e630d04e17b93ccc7043c31c Mon Sep 17 00:00:00 2001 From: Lucas Stach Date: Wed, 22 Oct 2014 14:31:55 +0200 Subject: PCI / PM: handle failure to enable wakeup on PCIe PME If the irqchip handling the PCIe PME interrupt is not able to enable interrupt wakeup we should properly reflect this in the PME suspend status. This fixes a kernel warning on resume, where it would try to disable the irq wakeup that failed to be activated while suspending, for example: WARNING: CPU: 0 PID: 609 at kernel/irq/manage.c:536 irq_set_irq_wake+0xc0/0xf8() Unbalanced IRQ 384 wake disable Fixes: 76cde7e49590 (PCI / PM: Make PCIe PME interrupts wake up from suspend-to-idle) Reported-and-tested-by: Richard Zhu Signed-off-by: Lucas Stach Signed-off-by: Rafael J. Wysocki --- drivers/pci/pcie/pme.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index a9f9c46e5022..63fc63911295 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -397,6 +397,7 @@ static int pcie_pme_suspend(struct pcie_device *srv) struct pcie_pme_service_data *data = get_service_data(srv); struct pci_dev *port = srv->port; bool wakeup; + int ret; if (device_may_wakeup(&port->dev)) { wakeup = true; @@ -407,9 +408,10 @@ static int pcie_pme_suspend(struct pcie_device *srv) } spin_lock_irq(&data->lock); if (wakeup) { - enable_irq_wake(srv->irq); + ret = enable_irq_wake(srv->irq); data->suspend_level = PME_SUSPEND_WAKEUP; - } else { + } + if (!wakeup || ret) { struct pci_dev *port = srv->port; pcie_pme_interrupt_enable(port, false); -- cgit v1.2.3 From 36b4bed5cd8f6e17019fa7d380e0836872c7b367 Mon Sep 17 00:00:00 2001 From: Pali Rohár Date: Thu, 16 Oct 2014 01:16:51 +0200 Subject: cpufreq: intel_pstate: Fix setting max_perf_pct in performance policy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Code which changes policy to powersave changes also max_policy_pct based on max_freq. Code which change max_perf_pct has upper limit base on value max_policy_pct. When policy is changing from powersave back to performance then max_policy_pct is not changed. Which means that changing max_perf_pct is not possible to high values if max_freq was too low in powersave policy. Test case: $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_min_freq 800000 $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq 3300000 $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor performance $ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct 100 $ echo powersave > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor $ echo 800000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq $ echo 20 > /sys/devices/system/cpu/intel_pstate/max_perf_pct $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor powersave $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq 800000 $ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct 20 $ echo performance > /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor $ echo 3300000 > /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq $ echo 100 > /sys/devices/system/cpu/intel_pstate/max_perf_pct $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor performance $ cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_max_freq 3300000 $ cat /sys/devices/system/cpu/intel_pstate/max_perf_pct 24 And now intel_pstate driver allows to set maximal value for max_perf_pct based on max_policy_pct which is 24 for previous powersave max_freq 800000. This patch will set default value for max_policy_pct when setting policy to performance so it will allow to set also max value for max_perf_pct. Signed-off-by: Pali Rohár Cc: All applicable Acked-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 0668b389c516..7547ab559a97 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -714,6 +714,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { limits.min_perf_pct = 100; limits.min_perf = int_tofp(1); + limits.max_policy_pct = 100; limits.max_perf_pct = 100; limits.max_perf = int_tofp(1); limits.no_turbo = limits.turbo_disabled; -- cgit v1.2.3 From c034b02e213d271b98c45c4a7b54af8f69aaac1e Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Mon, 13 Oct 2014 08:37:40 -0700 Subject: cpufreq: expose scaling_cur_freq sysfs file for set_policy() drivers Currently the core does not expose scaling_cur_freq for set_policy() drivers this breaks some userspace monitoring tools. Change the core to expose this file for all drivers and if the set_policy() driver supports the get() callback use it to retrieve the current frequency. Link: https://bugzilla.kernel.org/show_bug.cgi?id=73741 Cc: All applicable Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 058d6e084a6d..644b54e1e7d1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -512,7 +512,18 @@ show_one(cpuinfo_max_freq, cpuinfo.max_freq); show_one(cpuinfo_transition_latency, cpuinfo.transition_latency); show_one(scaling_min_freq, min); show_one(scaling_max_freq, max); -show_one(scaling_cur_freq, cur); + +static ssize_t show_scaling_cur_freq( + struct cpufreq_policy *policy, char *buf) +{ + ssize_t ret; + + if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) + ret = sprintf(buf, "%u\n", cpufreq_driver->get(policy->cpu)); + else + ret = sprintf(buf, "%u\n", policy->cur); + return ret; +} static int cpufreq_set_policy(struct cpufreq_policy *policy, struct cpufreq_policy *new_policy); @@ -906,11 +917,11 @@ static int cpufreq_add_dev_interface(struct cpufreq_policy *policy, if (ret) goto err_out_kobj_put; } - if (has_target()) { - ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); - if (ret) - goto err_out_kobj_put; - } + + ret = sysfs_create_file(&policy->kobj, &scaling_cur_freq.attr); + if (ret) + goto err_out_kobj_put; + if (cpufreq_driver->bios_limit) { ret = sysfs_create_file(&policy->kobj, &bios_limit.attr); if (ret) -- cgit v1.2.3 From 4521e1a0ce173daa23dfef8312d09051e057ff8e Mon Sep 17 00:00:00 2001 From: Gabriele Mazzotta Date: Mon, 13 Oct 2014 08:37:41 -0700 Subject: cpufreq: intel_pstate: Reflect current no_turbo state correctly Some BIOSes modify the state of MSR_IA32_MISC_ENABLE_TURBO_DISABLE based on the current power source for the system battery AC vs battery. Reflect the correct current state and ability to modify the no_turbo sysfs file based on current state of MSR_IA32_MISC_ENABLE_TURBO_DISABLE. Link: https://bugzilla.kernel.org/show_bug.cgi?id=83151 Cc: All applicable Signed-off-by: Gabriele Mazzotta Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 48 +++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 7547ab559a97..5f5f5ed11691 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -138,6 +138,7 @@ struct perf_limits { static struct perf_limits limits = { .no_turbo = 0, + .turbo_disabled = 0, .max_perf_pct = 100, .max_perf = int_tofp(1), .min_perf_pct = 0, @@ -218,6 +219,18 @@ static inline void intel_pstate_reset_all_pid(void) } } +static inline void update_turbo_state(void) +{ + u64 misc_en; + struct cpudata *cpu; + + cpu = all_cpu_data[0]; + rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); + limits.turbo_disabled = + (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || + cpu->pstate.max_pstate == cpu->pstate.turbo_pstate); +} + /************************** debugfs begin ************************/ static int pid_param_set(void *data, u64 val) { @@ -274,6 +287,20 @@ static void __init intel_pstate_debug_expose_params(void) return sprintf(buf, "%u\n", limits.object); \ } +static ssize_t show_no_turbo(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + ssize_t ret; + + update_turbo_state(); + if (limits.turbo_disabled) + ret = sprintf(buf, "%u\n", limits.turbo_disabled); + else + ret = sprintf(buf, "%u\n", limits.no_turbo); + + return ret; +} + static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, const char *buf, size_t count) { @@ -283,11 +310,14 @@ static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - limits.no_turbo = clamp_t(int, input, 0 , 1); + + update_turbo_state(); if (limits.turbo_disabled) { pr_warn("Turbo disabled by BIOS or unavailable on processor\n"); - limits.no_turbo = limits.turbo_disabled; + return -EPERM; } + limits.no_turbo = clamp_t(int, input, 0, 1); + return count; } @@ -323,7 +353,6 @@ static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, return count; } -show_one(no_turbo, no_turbo); show_one(max_perf_pct, max_perf_pct); show_one(min_perf_pct, min_perf_pct); @@ -501,7 +530,7 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) int max_perf_adj; int min_perf; - if (limits.no_turbo) + if (limits.no_turbo || limits.turbo_disabled) max_perf = cpu->pstate.max_pstate; max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); @@ -516,6 +545,8 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) { int max_perf, min_perf; + update_turbo_state(); + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); pstate = clamp_t(int, pstate, min_perf, max_perf); @@ -717,7 +748,7 @@ static int intel_pstate_set_policy(struct cpufreq_policy *policy) limits.max_policy_pct = 100; limits.max_perf_pct = 100; limits.max_perf = int_tofp(1); - limits.no_turbo = limits.turbo_disabled; + limits.no_turbo = 0; return 0; } limits.min_perf_pct = (policy->min * 100) / policy->cpuinfo.max_freq; @@ -760,7 +791,6 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) { struct cpudata *cpu; int rc; - u64 misc_en; rc = intel_pstate_init_cpu(policy->cpu); if (rc) @@ -768,12 +798,6 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) cpu = all_cpu_data[policy->cpu]; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_en); - if (misc_en & MSR_IA32_MISC_ENABLE_TURBO_DISABLE || - cpu->pstate.max_pstate == cpu->pstate.turbo_pstate) { - limits.turbo_disabled = 1; - limits.no_turbo = 1; - } if (limits.min_perf_pct == 100 && limits.max_perf_pct == 100) policy->policy = CPUFREQ_POLICY_PERFORMANCE; else -- cgit v1.2.3 From c034871712730a33e0267095f48b62eae958499c Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Mon, 13 Oct 2014 08:37:42 -0700 Subject: intel_pstate: Don't lose sysfs settings during cpu offline The user may have custom settings don't destroy them during suspend. Link: https://bugzilla.kernel.org/show_bug.cgi?id=80651 Reported-by: Tobias Jakobi Cc: All applicable Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 5f5f5ed11691..97b349a6f5c7 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -702,7 +702,9 @@ static int intel_pstate_init_cpu(unsigned int cpunum) { struct cpudata *cpu; - all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL); + if (!all_cpu_data[cpunum]) + all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), + GFP_KERNEL); if (!all_cpu_data[cpunum]) return -ENOMEM; @@ -783,8 +785,6 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) del_timer_sync(&all_cpu_data[cpu_num]->timer); intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); - kfree(all_cpu_data[cpu_num]); - all_cpu_data[cpu_num] = NULL; } static int intel_pstate_cpu_init(struct cpufreq_policy *policy) -- cgit v1.2.3 From b27580b05e6f5253228debc60b8ff4a786ff573a Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Mon, 13 Oct 2014 08:37:43 -0700 Subject: intel_pstate: Fix BYT frequency reporting BYT has a different conversion from P state to frequency than the core processors. This causes the min/max and current frequency to be misreported on some BYT SKUs. Tested on BYT N2820, Ivybridge and Haswell processors. Link: https://bugzilla.yoctoproject.org/show_bug.cgi?id=6663 Cc: All applicable Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 97b349a6f5c7..98593e0344c5 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -64,6 +64,7 @@ struct pstate_data { int current_pstate; int min_pstate; int max_pstate; + int scaling; int turbo_pstate; }; @@ -113,6 +114,7 @@ struct pstate_funcs { int (*get_max)(void); int (*get_min)(void); int (*get_turbo)(void); + int (*get_scaling)(void); void (*set)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); }; @@ -433,6 +435,22 @@ static void byt_set_pstate(struct cpudata *cpudata, int pstate) wrmsrl(MSR_IA32_PERF_CTL, val); } +#define BYT_BCLK_FREQS 5 +static int byt_freq_table[BYT_BCLK_FREQS] = { 833, 1000, 1333, 1167, 800}; + +static int byt_get_scaling(void) +{ + u64 value; + int i; + + rdmsrl(MSR_FSB_FREQ, value); + i = value & 0x3; + + BUG_ON(i > BYT_BCLK_FREQS); + + return byt_freq_table[i] * 100; +} + static void byt_get_vid(struct cpudata *cpudata) { u64 value; @@ -478,6 +496,11 @@ static int core_get_turbo_pstate(void) return ret; } +static inline int core_get_scaling(void) +{ + return 100000; +} + static void core_set_pstate(struct cpudata *cpudata, int pstate) { u64 val; @@ -502,6 +525,7 @@ static struct cpu_defaults core_params = { .get_max = core_get_max_pstate, .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, + .get_scaling = core_get_scaling, .set = core_set_pstate, }, }; @@ -520,6 +544,7 @@ static struct cpu_defaults byt_params = { .get_min = byt_get_min_pstate, .get_turbo = byt_get_turbo_pstate, .set = byt_set_pstate, + .get_scaling = byt_get_scaling, .get_vid = byt_get_vid, }, }; @@ -554,7 +579,7 @@ static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) if (pstate == cpu->pstate.current_pstate) return; - trace_cpu_frequency(pstate * 100000, cpu->cpu); + trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); cpu->pstate.current_pstate = pstate; @@ -566,6 +591,7 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) cpu->pstate.min_pstate = pstate_funcs.get_min(); cpu->pstate.max_pstate = pstate_funcs.get_max(); cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); + cpu->pstate.scaling = pstate_funcs.get_scaling(); if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); @@ -581,7 +607,9 @@ static inline void intel_pstate_calc_busy(struct cpudata *cpu) core_pct = div64_u64(core_pct, int_tofp(sample->mperf)); sample->freq = fp_toint( - mul_fp(int_tofp(cpu->pstate.max_pstate * 1000), core_pct)); + mul_fp(int_tofp( + cpu->pstate.max_pstate * cpu->pstate.scaling / 100), + core_pct)); sample->core_pct_busy = (int32_t)core_pct; } @@ -803,12 +831,13 @@ static int intel_pstate_cpu_init(struct cpufreq_policy *policy) else policy->policy = CPUFREQ_POLICY_POWERSAVE; - policy->min = cpu->pstate.min_pstate * 100000; - policy->max = cpu->pstate.turbo_pstate * 100000; + policy->min = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->max = cpu->pstate.turbo_pstate * cpu->pstate.scaling; /* cpuinfo and default policy values */ - policy->cpuinfo.min_freq = cpu->pstate.min_pstate * 100000; - policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * 100000; + policy->cpuinfo.min_freq = cpu->pstate.min_pstate * cpu->pstate.scaling; + policy->cpuinfo.max_freq = + cpu->pstate.turbo_pstate * cpu->pstate.scaling; policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; cpumask_set_cpu(policy->cpu, policy->cpus); @@ -866,6 +895,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_max = funcs->get_max; pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; + pstate_funcs.get_scaling = funcs->get_scaling; pstate_funcs.set = funcs->set; pstate_funcs.get_vid = funcs->get_vid; } -- cgit v1.2.3 From d022a65ed2473fac4a600e3424503dc571160a3e Mon Sep 17 00:00:00 2001 From: Dirk Brandewie Date: Mon, 13 Oct 2014 08:37:44 -0700 Subject: intel_pstate: Correct BYT VID values. Using a VID value that is not high enough for the requested P state can cause machine checks. Add a ceiling function to ensure calulated VIDs with fractional values are set to the next highest integer VID value. The algorythm for calculating the non-trubo VID from the BIOS writers guide is: vid_ratio = (vid_max - vid_min) / (max_pstate - min_pstate) vid = ceiling(vid_min + (req_pstate - min_pstate) * vid_ratio) Cc: All applicable Signed-off-by: Dirk Brandewie Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 98593e0344c5..27bb6d3877ed 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -52,6 +52,17 @@ static inline int32_t div_fp(int32_t x, int32_t y) return div_s64((int64_t)x << FRAC_BITS, y); } +static inline int ceiling_fp(int32_t x) +{ + int mask, ret; + + ret = fp_toint(x); + mask = (1 << FRAC_BITS) - 1; + if (x & mask) + ret += 1; + return ret; +} + struct sample { int32_t core_pct_busy; u64 aperf; @@ -425,7 +436,7 @@ static void byt_set_pstate(struct cpudata *cpudata, int pstate) cpudata->vid.ratio); vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); - vid = fp_toint(vid_fp); + vid = ceiling_fp(vid_fp); if (pstate > cpudata->pstate.max_pstate) vid = cpudata->vid.turbo; -- cgit v1.2.3 From 4aa7c6346be395bdf776f82bbb2e3e2bc60bdd2b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: add i_op->dentry_open() Add a new inode operation i_op->dentry_open(). This is for stacked filesystems that want to return a struct file from a different filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/Locking | 2 ++ Documentation/filesystems/vfs.txt | 7 +++++++ fs/namei.c | 9 ++++++--- fs/open.c | 23 +++++++++++++++++++++-- include/linux/fs.h | 4 ++++ 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 94d93b1f8b53..b30753cbf431 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -67,6 +67,7 @@ prototypes: struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); locking rules: all may block @@ -96,6 +97,7 @@ fiemap: no update_time: no atomic_open: yes tmpfile: no +dentry_open: no Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on victim. diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index fceff7c00a3c..20bf204426ca 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -364,6 +364,7 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); }; Again, all methods are called without any locks being held, unless @@ -696,6 +697,12 @@ struct address_space_operations { but instead uses bmap to find out where the blocks in the file are and uses those addresses directly. + dentry_open: *WARNING: probably going away soon, do not use!* This is an + alternative to f_op->open(), the difference is that this method may open + a file not necessarily originating from the same filesystem as the one + i_op->open() was called on. It may be useful for stacking filesystems + which want to allow native I/O directly on underlying files. + invalidatepage: If a page has PagePrivate set, then invalidatepage will be called when part or all of the page is to be removed diff --git a/fs/namei.c b/fs/namei.c index 43927d14db67..75306b3c9526 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3064,9 +3064,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..de92c13b58be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index a957d4366c24..5cf7f6759679 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1528,6 +1528,9 @@ struct inode_operations { umode_t create_mode, int *opened); int (*tmpfile) (struct inode *, struct dentry *, umode_t); int (*set_acl)(struct inode *, struct posix_acl *, int); + + /* WARNING: probably going away soon, do not use! */ + int (*dentry_open)(struct dentry *, struct file *, const struct cred *); } ____cacheline_aligned; ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, @@ -2040,6 +2043,7 @@ extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(struct dentry *, struct vfsmount *, const char *, int); +extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file * dentry_open(const struct path *, int, const struct cred *); extern int filp_close(struct file *, fl_owner_t id); -- cgit v1.2.3 From 1c118596a7682912106c80007102ce0184c77780 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export do_splice_direct() to modules Export do_splice_direct() to modules. Needed by overlay filesystem. Signed-off-by: Miklos Szeredi --- fs/internal.h | 6 ------ fs/splice.c | 1 + include/linux/fs.h | 3 +++ 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/internal.h b/fs/internal.h index 9477f8f6aefc..0f0626a6997c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -138,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, */ extern int rw_verify_area(int, struct file *, const loff_t *, size_t); -/* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - /* * pipe.c */ diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5cf7f6759679..10ed65b2c31d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2456,6 +2456,9 @@ extern ssize_t iter_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *, size_t len, unsigned int flags); +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + extern void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); -- cgit v1.2.3 From bd5d08569cc379f8366663a61558a9ce17c2e460 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:35 +0200 Subject: vfs: export __inode_permission() to modules We need to be able to check inode permissions (but not filesystem implied permissions) for stackable filesystems. Expose this interface for overlayfs. Signed-off-by: Miklos Szeredi --- fs/internal.h | 1 - fs/namei.c | 1 + include/linux/fs.h | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/internal.h b/fs/internal.h index 0f0626a6997c..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -47,7 +47,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); diff --git a/fs/namei.c b/fs/namei.c index 75306b3c9526..d944f6db9b07 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions diff --git a/include/linux/fs.h b/include/linux/fs.h index 10ed65b2c31d..5419df70a835 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2257,6 +2257,7 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); +extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); static inline bool execute_ok(struct inode *inode) -- cgit v1.2.3 From c771d683a62e5d36bc46036f5c07f4f5bb7dda61 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: introduce clone_private_mount() Overlayfs needs a private clone of the mount, so create a function for this and export to modules. Signed-off-by: Miklos Szeredi --- fs/namespace.c | 27 +++++++++++++++++++++++++++ include/linux/mount.h | 3 +++ 2 files changed, 30 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index fbba8b17330d..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1686,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { diff --git a/include/linux/mount.h b/include/linux/mount.h index 9262e4bf0cc3..c2c561dc0114 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -81,6 +81,9 @@ extern struct vfsmount *mntget(struct vfsmount *mnt); extern struct vfsmount *mnt_clone_internal(struct path *path); extern int __mnt_is_readonly(struct vfsmount *mnt); +struct path; +extern struct vfsmount *clone_private_mount(struct path *path); + struct file_system_type; extern struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, -- cgit v1.2.3 From cbdf35bcb833bfd00f0925d7a9a33a21f41ea582 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: export check_sticky() It's already duplicated in btrfs and about to be used in overlayfs too. Move the sticky bit check to an inline helper and call the out-of-line helper only in the unlikly case of the sticky bit being set. Signed-off-by: Miklos Szeredi --- fs/btrfs/ioctl.c | 20 +------------------- fs/namei.c | 9 ++------- include/linux/fs.h | 9 +++++++++ 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8d2b76e29d3b..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -765,23 +765,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -817,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { diff --git a/fs/namei.c b/fs/namei.c index d944f6db9b07..77fd536106cb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2384,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check diff --git a/include/linux/fs.h b/include/linux/fs.h index 5419df70a835..55cc0a319baa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2259,6 +2259,7 @@ extern int notify_change(struct dentry *, struct iattr *, struct inode **); extern int inode_permission(struct inode *, int); extern int __inode_permission(struct inode *, int); extern int generic_permission(struct inode *, int); +extern int __check_sticky(struct inode *dir, struct inode *inode); static inline bool execute_ok(struct inode *inode) { @@ -2745,6 +2746,14 @@ static inline int is_sxid(umode_t mode) return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP)); } +static inline int check_sticky(struct inode *dir, struct inode *inode) +{ + if (!(dir->i_mode & S_ISVTX)) + return 0; + + return __check_sticky(dir, inode); +} + static inline void inode_has_no_xattr(struct inode *inode) { if (!is_sxid(inode->i_mode) && (inode->i_sb->s_flags & MS_NOSEC)) -- cgit v1.2.3 From 787fb6bc9682ec7c05fb5d9561b57100fbc1cc41 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:36 +0200 Subject: vfs: add whiteout support Whiteout isn't actually a new file type, but is represented as a char device (Linus's idea) with 0/0 device number. This has several advantages compared to introducing a new whiteout file type: - no userspace API changes (e.g. trivial to make backups of upper layer filesystem, without losing whiteouts) - no fs image format changes (you can boot an old kernel/fsck without whiteout support and things won't break) - implementation is trivial Signed-off-by: Miklos Szeredi --- fs/namei.c | 14 ++++++++++++++ include/linux/fs.h | 11 +++++++++++ 2 files changed, 25 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 77fd536106cb..d20191c0ebf5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4346,6 +4346,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/include/linux/fs.h b/include/linux/fs.h index 55cc0a319baa..69118b3cb917 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -222,6 +222,13 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) +/* + * Whiteout is represented by a char device. The following constants define the + * mode and device number to use. + */ +#define WHITEOUT_MODE 0 +#define WHITEOUT_DEV 0 + /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. @@ -1398,6 +1405,7 @@ extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct ino extern int vfs_rmdir(struct inode *, struct dentry *); extern int vfs_unlink(struct inode *, struct dentry *, struct inode **); extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int); +extern int vfs_whiteout(struct inode *, struct dentry *); /* * VFS dentry helper functions. @@ -1628,6 +1636,9 @@ struct super_operations { #define IS_AUTOMOUNT(inode) ((inode)->i_flags & S_AUTOMOUNT) #define IS_NOSEC(inode) ((inode)->i_flags & S_NOSEC) +#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \ + (inode)->i_rdev == WHITEOUT_DEV) + /* * Inode state bits. Protected by inode->i_lock * -- cgit v1.2.3 From 0d7a855526dd672e114aff2ac22b60fc6f155b08 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: vfs: add RENAME_WHITEOUT This adds a new RENAME_WHITEOUT flag. This flag makes rename() create a whiteout of source. The whiteout creation is atomic relative to the rename. Signed-off-by: Miklos Szeredi --- fs/namei.c | 8 ++++++-- include/uapi/linux/fs.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index d20191c0ebf5..42df664e95e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -4209,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index ca1a11bb4443..3735fa0a6784 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -37,6 +37,7 @@ #define RENAME_NOREPLACE (1 << 0) /* Don't overwrite target */ #define RENAME_EXCHANGE (1 << 1) /* Exchange source and dest */ +#define RENAME_WHITEOUT (1 << 2) /* Whiteout source */ struct fstrim_range { __u64 start; -- cgit v1.2.3 From cd808deced431b66b5fa4e5c193cb7ec0059eaff Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: ext4: support RENAME_WHITEOUT Add whiteout support to ext4_rename(). A whiteout inode (chrdev/0,0) is created before the rename takes place. The whiteout inode is added to the old entry instead of deleting it. Signed-off-by: Miklos Szeredi --- fs/ext4/namei.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 17 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 603e4ebbd0ac..aba86e8ef1ef 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3190,6 +3190,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } +static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, + int credits, handle_t **h) +{ + struct inode *wh; + handle_t *handle; + int retries = 0; + + /* + * for inode block, sb block, group summaries, + * and inode bitmap + */ + credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS + 4); +retry: + wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + &ent->dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + + handle = ext4_journal_current_handle(); + if (IS_ERR(wh)) { + if (handle) + ext4_journal_stop(handle); + if (PTR_ERR(wh) == -ENOSPC && + ext4_should_retry_alloc(ent->dir->i_sb, &retries)) + goto retry; + } else { + *h = handle; + init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); + wh->i_op = &ext4_special_inode_operations; + } + return wh; +} + /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. @@ -3199,7 +3232,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { @@ -3214,6 +3248,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, }; int force_reread; int retval; + struct inode *whiteout = NULL; + int credits; + u8 old_file_type; dquot_initialize(old.dir); dquot_initialize(new.dir); @@ -3252,11 +3289,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); - handle = ext4_journal_start(old.dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (!(flags & RENAME_WHITEOUT)) { + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } else { + whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + if (IS_ERR(whiteout)) + return PTR_ERR(whiteout); + } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); @@ -3284,13 +3327,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); + + old_file_type = old.de->file_type; + if (whiteout) { + /* + * Do this before adding a new entry, so the old entry is sure + * to be still pointing to the valid old entry. + */ + retval = ext4_setent(handle, &old, whiteout->i_ino, + EXT4_FT_CHRDEV); + if (retval) + goto end_rename; + ext4_mark_inode_dirty(handle, whiteout); + } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, - old.inode->i_ino, old.de->file_type); + old.inode->i_ino, old_file_type); if (retval) goto end_rename; } @@ -3305,10 +3361,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old.inode->i_ctime = ext4_current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); - /* - * ok, that's it - */ - ext4_rename_delete(handle, &old, force_reread); + if (!whiteout) { + /* + * ok, that's it + */ + ext4_rename_delete(handle, &old, force_reread); + } if (new.inode) { ext4_dec_count(handle, new.inode); @@ -3344,6 +3402,12 @@ end_rename: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); + if (whiteout) { + if (retval) + drop_nlink(whiteout); + unlock_new_inode(whiteout); + iput(whiteout); + } if (handle) ext4_journal_stop(handle); return retval; @@ -3476,18 +3540,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } - /* - * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" - * is equivalent to regular rename. - */ - return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); + + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } /* -- cgit v1.2.3 From 46fdb794e3f52ef18b859ebc92f0a9d7db21c5df Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:37 +0200 Subject: shmem: support RENAME_WHITEOUT Allocate a dentry, initialize it with a whiteout and hash it in the place of the old dentry. Later the old dentry will be moved away and the whiteout will remain. i_mutex protects agains concurrent readdir. Signed-off-by: Miklos Szeredi Cc: Hugh Dickins --- mm/shmem.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index cd6fc7590e54..185836ba53ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2345,6 +2345,32 @@ static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, stru return 0; } +static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) +{ + struct dentry *whiteout; + int error; + + whiteout = d_alloc(old_dentry->d_parent, &old_dentry->d_name); + if (!whiteout) + return -ENOMEM; + + error = shmem_mknod(old_dir, whiteout, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); + dput(whiteout); + if (error) + return error; + + /* + * Cheat and hash the whiteout while the old dentry is still in + * place, instead of playing games with FS_RENAME_DOES_D_MOVE. + * + * d_lookup() will consistently find one of them at this point, + * not sure which one, but that isn't even important. + */ + d_rehash(whiteout); + return 0; +} + /* * The VFS layer already does all the dentry stuff for rename, * we just have to decrement the usage count for the target if @@ -2356,7 +2382,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc struct inode *inode = old_dentry->d_inode; int they_are_dirs = S_ISDIR(inode->i_mode); - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) @@ -2365,6 +2391,14 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (!simple_empty(new_dentry)) return -ENOTEMPTY; + if (flags & RENAME_WHITEOUT) { + int error; + + error = shmem_whiteout(old_dir, old_dentry); + if (error) + return error; + } + if (new_dentry->d_inode) { (void) shmem_unlink(new_dir, new_dentry); if (they_are_dirs) { -- cgit v1.2.3 From e9be9d5e76e34872f0c37d72e25bc27fe9e2c54c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: overlay filesystem Overlayfs allows one, usually read-write, directory tree to be overlaid onto another, read-only directory tree. All modifications go to the upper, writable layer. This type of mechanism is most often used for live CDs but there's a wide variety of other uses. The implementation differs from other "union filesystem" implementations in that after a file is opened all operations go directly to the underlying, lower or upper, filesystems. This simplifies the implementation and allows native performance in these cases. The dentry tree is duplicated from the underlying filesystems, this enables fast cached lookups without adding special support into the VFS. This uses slightly more memory than union mounts, but dentries are relatively small. Currently inodes are duplicated as well, but it is a possible optimization to share inodes for non-directories. Opening non directories results in the open forwarded to the underlying filesystem. This makes the behavior very similar to union mounts (with the same limitations vs. fchmod/fchown on O_RDONLY file descriptors). Usage: mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper/upper,workdir=/upper/work /overlay The following cotributions have been folded into this patch: Neil Brown : - minimal remount support - use correct seek function for directories - initialise is_real before use - rename ovl_fill_cache to ovl_dir_read Felix Fietkau : - fix a deadlock in ovl_dir_read_merged - fix a deadlock in ovl_remove_whiteouts Erez Zadok - fix cleanup after WARN_ON Sedat Dilek - fix up permission to confirm to new API Robin Dong - fix possible leak in ovl_new_inode - create new inode in ovl_link Andy Whitcroft - switch to __inode_permission() - copy up i_uid/i_gid from the underlying inode AV: - ovl_copy_up_locked() - dput(ERR_PTR(...)) on two failure exits - ovl_clear_empty() - one failure exit forgetting to do unlock_rename(), lack of check for udir being the parent of upper, dropping and regaining the lock on udir (which would require _another_ check for parent being right). - bogus d_drop() in copyup and rename [fix from your mail] - copyup/remove and copyup/rename races [fix from your mail] - ovl_dir_fsync() leaving ERR_PTR() in ->realfile - ovl_entry_free() is pointless - it's just a kfree_rcu() - fold ovl_do_lookup() into ovl_lookup() - manually assigning ->d_op is wrong. Just use ->s_d_op. [patches picked from Miklos]: * copyup/remove and copyup/rename races * bogus d_drop() in copyup and rename Also thanks to the following people for testing and reporting bugs: Jordi Pujol Andy Whitcroft Michal Suchanek Felix Fietkau Erez Zadok Randy Dunlap Signed-off-by: Miklos Szeredi --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/overlayfs/Kconfig | 10 + fs/overlayfs/Makefile | 7 + fs/overlayfs/copy_up.c | 414 +++++++++++++++++++++ fs/overlayfs/dir.c | 921 +++++++++++++++++++++++++++++++++++++++++++++++ fs/overlayfs/inode.c | 425 ++++++++++++++++++++++ fs/overlayfs/overlayfs.h | 191 ++++++++++ fs/overlayfs/readdir.c | 587 ++++++++++++++++++++++++++++++ fs/overlayfs/super.c | 727 +++++++++++++++++++++++++++++++++++++ 10 files changed, 3284 insertions(+) create mode 100644 fs/overlayfs/Kconfig create mode 100644 fs/overlayfs/Makefile create mode 100644 fs/overlayfs/copy_up.c create mode 100644 fs/overlayfs/dir.c create mode 100644 fs/overlayfs/inode.c create mode 100644 fs/overlayfs/overlayfs.h create mode 100644 fs/overlayfs/readdir.c create mode 100644 fs/overlayfs/super.c diff --git a/fs/Kconfig b/fs/Kconfig index db5dc1598716..664991afe0c0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" menu "Caches" diff --git a/fs/Makefile b/fs/Makefile index 90c88529892b..34a1b9dea6dd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..e60125976873 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,10 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..8f91889480d0 --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o + +overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..ea10a8719107 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,414 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; + +} + +static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, + struct dentry *dentry, struct path *lowerpath, + struct kstat *stat, struct iattr *attr, + const char *link) +{ + struct inode *wdir = workdir->d_inode; + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry = NULL; + struct dentry *upper = NULL; + umode_t mode = stat->mode; + int err; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out1; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); + stat->mode = mode; + if (err) + goto out2; + + if (S_ISREG(stat->mode)) { + struct path upperpath; + ovl_path_upper(dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = newdentry; + + err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); + if (err) + goto out_cleanup; + } + + err = ovl_copy_xattr(lowerpath->dentry, newdentry); + if (err) + goto out_cleanup; + + mutex_lock(&newdentry->d_inode->i_mutex); + err = ovl_set_attr(newdentry, stat); + if (!err && attr) + err = notify_change(newdentry, attr, NULL); + mutex_unlock(&newdentry->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + + ovl_dentry_update(dentry, newdentry); + newdentry = NULL; + + /* + * Non-directores become opaque when copied up. + */ + if (!S_ISDIR(stat->mode)) + ovl_dentry_set_opaque(dentry, true); +out2: + dput(upper); +out1: + dput(newdentry); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr) +{ + struct dentry *workdir = ovl_workdir(dentry); + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + struct dentry *upperdentry; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(&parentpath, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_CHOWN for chown + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + err = -EIO; + if (lock_rename(workdir, upperdir) != NULL) { + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + goto out_unlock; + } + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + unlock_rename(workdir, upperdir); + err = 0; + /* Raced with another copy-up? Do the setattr here */ + if (attr) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } + goto out_put_cred; + } + + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, attr, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } +out_unlock: + unlock_rename(workdir, upperdir); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + + dput(parent); + dput(next); + } + + return err; +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..15cd91ad9940 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,921 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include "overlayfs.h" + +void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (S_ISDIR(wdentry->d_inode->i_mode)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } +} + +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +{ + struct dentry *temp; + char name[20]; + + snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir, + struct dentry *dentry) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir, dentry); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug) +{ + int err; + + if (newdentry->d_inode) + return -ESTALE; + + if (hardlink) { + err = ovl_do_link(hardlink, dir, newdentry, debug); + } else { + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, stat->mode, debug); + break; + + case S_IFDIR: + err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, + stat->mode, stat->rdev, debug); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, link, debug); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -ENOENT; + } + return err; +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); +} + +static void ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + + err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + if (err) { + pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", + upperdentry->d_name.name, err); + } +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(&realpath, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&udir->i_mutex); + return err; +} + +static int ovl_lock_rename_workdir(struct dentry *workdir, + struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + if (err) + goto out_dput; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(opaquedir); + if (err) + goto out_cleanup; + + mutex_lock(&opaquedir->d_inode->i_mutex); + err = ovl_set_attr(opaquedir, &stat); + mutex_unlock(&opaquedir->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); +out_dput: + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, + enum ovl_path_type type) +{ + int err; + struct dentry *ret = NULL; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (err) + ret = ERR_PTR(err); + else if (type == OVL_PATH_MERGE) + ret = ovl_clear_empty(dentry, &list); + + ovl_cache_free(&list); + + return ret; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_dput; + + err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + if (err) + goto out_dput2; + + if (S_ISDIR(stat->mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput2: + dput(upper); +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out_dput2; +} + +static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, + const char *link, struct dentry *hardlink) +{ + int err; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + if (!ovl_dentry_is_opaque(dentry)) { + err = ovl_create_upper(dentry, inode, &stat, link, hardlink); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_iput; + + /* + * CAP_SYS_ADMIN for setting opaque xattr + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, link, + hardlink); + + revert_creds(old_cred); + put_cred(override_cred); + } + + if (!err) + inode = NULL; +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_create_or_link(dentry, mode, rdev, link, NULL); + ovl_drop_write(dentry); + } + + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *upper; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + upper = ovl_dentry_upper(old); + err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + enum ovl_path_type type, bool is_dir) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *whiteout; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (is_dir) { + opaquedir = ovl_check_empty_and_clear(dentry, type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_unlock; + + if (type == OVL_PATH_LOWER) { + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto kill_whiteout; + + err = ovl_do_rename(wdir, whiteout, udir, upper, 0); + dput(upper); + if (err) + goto kill_whiteout; + } else { + int flags = 0; + + upper = ovl_dentry_upper(dentry); + if (opaquedir) + upper = opaquedir; + err = -ESTALE; + if (upper->d_parent != upperdir) + goto kill_whiteout; + + if (is_dir) + flags |= RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + + if (is_dir) + ovl_cleanup(wdir, upper); + } + ovl_dentry_version_inc(dentry->d_parent); +out_d_drop: + d_drop(dentry); + dput(whiteout); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out_d_drop; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper = ovl_dentry_upper(dentry); + int err; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = -ESTALE; + if (upper->d_parent == upperdir) { + /* Don't let d_delete() think it can reset d_inode */ + dget(upper); + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + dput(upper); + ovl_dentry_version_inc(dentry->d_parent); + } + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + d_drop(dentry); + mutex_unlock(&dir->i_mutex); + + return err; +} + +static inline int ovl_check_sticky(struct dentry *dentry) +{ + struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; + struct inode *inode = ovl_dentry_real(dentry)->d_inode; + + if (check_sticky(dir, inode)) + return -EPERM; + + return 0; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + enum ovl_path_type type; + int err; + + err = ovl_check_sticky(dentry); + if (err) + goto out; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + type = ovl_path_type(dentry); + if (type == OVL_PATH_PURE_UPPER) { + err = ovl_remove_upper(dentry, is_dir); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + + err = ovl_remove_and_whiteout(dentry, type, is_dir); + + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static int ovl_rename2(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool new_is_dir = false; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct cred *override_cred = NULL; + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + err = ovl_check_sticky(old); + if (err) + goto out; + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + err = -EXDEV; + if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + goto out; + + if (new->d_inode) { + err = ovl_check_sticky(new); + if (err) + goto out; + + if (S_ISDIR(new->d_inode->i_mode)) + new_is_dir = true; + + new_type = ovl_path_type(new); + err = -EXDEV; + if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + goto out; + + err = 0; + if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + goto out; + } + if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + goto out; + } + } else { + if (ovl_dentry_is_opaque(new)) + new_type = OVL_PATH_UPPER; + else + new_type = OVL_PATH_PURE_UPPER; + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } + + old_opaque = old_type != OVL_PATH_PURE_UPPER; + new_opaque = new_type != OVL_PATH_PURE_UPPER; + + if (old_opaque || new_opaque) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + } + + if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new, new_type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + if (overwrite) { + if (old_opaque) { + if (new->d_inode || !new_opaque) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && !new->d_inode && new_opaque) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + if (opaquedir) { + newdentry = opaquedir; + opaquedir = NULL; + } else { + dget(newdentry); + } + } else { + new_create = true; + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + if (!overwrite && new_is_dir && old_opaque && !new_opaque) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + + if (old_opaque || new_opaque) { + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + flags); + } else { + /* No debug for the plain case */ + BUG_ON(flags & ~RENAME_EXCHANGE); + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + NULL, flags); + } + + if (err) { + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(newdentry); + goto out_dput; + } + + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(newdentry); + + if (old_opaque != new_opaque) { + ovl_dentry_set_opaque(old, new_opaque); + if (!overwrite) + ovl_dentry_set_opaque(new, old_opaque); + } + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + if (old_opaque || new_opaque) { + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename2 = ovl_rename2, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..af2d18c9fcee --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,425 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include "overlayfs.h" + +static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, + bool no_data) +{ + int err; + struct dentry *parent; + struct kstat stat; + struct path lowerpath; + + parent = dget_parent(dentry); + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (err) + goto out_dput_parent; + + if (no_data) + stat.size = 0; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + +out_dput_parent: + dput(parent); + return err; +} + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } else { + err = ovl_copy_up_last(dentry, attr, false); + } + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(&realpath, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + alias = d_find_any_alias(inode); + if (WARN_ON(!alias)) + return -ENOENT; + + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + } + + err = __inode_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -EPERM; + if (ovl_is_private_xattr(name)) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + upperdentry = ovl_dentry_upper(dentry); + err = vfs_setxattr(upperdentry, name, value, size, flags); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + goto out_drop_write; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_removexattr(realpath.dentry, name); +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static int ovl_dentry_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + bool want_write = false; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + want_write = true; + err = ovl_want_write(dentry); + if (err) + goto out; + + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_last(dentry, NULL, true); + else + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_open(&realpath, file, cred); +out_drop_write: + if (want_write) + ovl_drop_write(dentry); +out: + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .dentry_open = ovl_dentry_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + iput(inode); + inode = NULL; + } + + return inode; + +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..814bed33dd07 --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,191 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include + +struct ovl_entry; + +enum ovl_path_type { + OVL_PATH_PURE_UPPER, + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +extern const char *ovl_opaque_xattr; + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool debug) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + if (debug) { + pr_debug("link(%pd2, %pd2) = %i\n", + old_dentry, new_dentry, err); + } + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_create(dir, dentry, mode, true); + if (debug) + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_mkdir(dir, dentry, mode); + if (debug) + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev, bool debug) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + if (debug) { + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", + dentry, mode, dev, err); + } + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname, bool debug) +{ + int err = vfs_symlink(dir, dentry, oldname); + if (debug) + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", + dentry, name, (int) size, (char *) value, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + olddentry, newdentry, flags); + + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + + if (err) { + pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct dentry *ovl_workdir(struct dentry *dentry); +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug); +void ovl_cleanup(struct inode *dir, struct dentry *dentry); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..c6787f84ece9 --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,587 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +struct ovl_cache_entry { + const char *name; + unsigned int len; + unsigned int type; + u64 ino; + bool is_whiteout; + struct list_head l_node; + struct rb_node node; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + bool is_merge; + struct rb_root *root; + struct list_head *list; + struct list_head *middle; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct ovl_cache_entry cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); + if (p) { + char *name_copy = (char *) (p + 1); + memcpy(name_copy, name, len); + name_copy[len] = '\0'; + p->name = name_copy; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + } + + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root->rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, rdd->root); + + return 0; +} + +static int ovl_fill_lower(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, rdd->middle); + } else { + p = ovl_cache_entry_new(name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + list_del(&od->cursor.l_node); + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(dentry) == cache) + ovl_set_dir_cache(dentry, NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + if (!rdd->is_merge) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + enum ovl_path_type type = ovl_path_type(dentry); + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + } + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) + od->is_real = false; +} + +static int ovl_dir_mark_whiteouts(struct dentry *dir, + struct ovl_readdir_data *rdd) +{ + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + ovl_cache_free(rdd->list); + return -ENOMEM; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(p, rdd->list, l_node) { + if (!p->name) + continue; + + if (p->type != DT_CHR) + continue; + + dentry = lookup_one_len(p->name, dir, p->len); + if (IS_ERR(dentry)) + continue; + + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + mutex_unlock(&dir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + + return 0; +} + +static inline int ovl_dir_read_merged(struct path *upperpath, + struct path *lowerpath, + struct list_head *list) +{ + int err; + struct rb_root root = RB_ROOT; + struct list_head middle; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .list = list, + .root = &root, + .is_merge = false, + }; + + if (upperpath->dentry) { + err = ovl_dir_read(upperpath, &rdd); + if (err) + goto out; + + if (lowerpath->dentry) { + err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (err) + goto out; + } + } + if (lowerpath->dentry) { + /* + * Insert lowerpath entries before upperpath ones, this allows + * offsets to be reasonably constant + */ + list_add(&middle, rdd.list); + rdd.middle = &middle; + rdd.is_merge = true; + err = ovl_dir_read(lowerpath, &rdd); + list_del(&middle); + } +out: + return err; + +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct ovl_cache_entry *p; + loff_t off = 0; + + list_for_each_entry(p, &od->cache->entries, l_node) { + if (!p->name) + continue; + if (off >= pos) + break; + off++; + } + list_move_tail(&od->cursor.l_node, &p->l_node); +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct path lowerpath; + struct path upperpath; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(dentry); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache->refcount++; + return cache; + } + ovl_set_dir_cache(dentry, NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(dentry, cache); + + return cache; +} + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) + return iterate_dir(od->realfile, ctx); + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor.l_node.next != &od->cache->entries) { + struct ovl_cache_entry *p; + + p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); + /* Skip cursors */ + if (p->name) { + if (!p->is_whiteout) { + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + ctx->pos++; + } + list_move(&od->cursor.l_node, &p->l_node); + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file_inode(file)->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file_inode(file)->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + struct inode *inode = file_inode(file); + + mutex_lock(&inode->i_mutex); + realfile = od->upperfile; + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } + mutex_unlock(&inode->i_mutex); + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + mutex_lock(&inode->i_mutex); + ovl_cache_put(od, file->f_path.dentry); + mutex_unlock(&inode->i_mutex); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + INIT_LIST_HEAD(&od->cursor.l_node); + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + od->is_upper = (type != OVL_PATH_LOWER); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path lowerpath; + struct path upperpath; + struct ovl_cache_entry *p; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + mutex_unlock(&upper->d_inode->i_mutex); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..227710aad781 --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,727 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi "); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; + struct dentry *workdir; +}; + +struct ovl_dir_cache; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct dentry *lowerdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; +}; + +const char *ovl_opaque_xattr = "trusted.overlay.opaque"; + + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry) { + if (S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + if (oe->opaque) + return OVL_PATH_UPPER; + else + return OVL_PATH_PURE_UPPER; + } + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + /* + * Make sure to order reads to upperdentry wrt ovl_dentry_update() + */ + smp_read_barrier_depends(); + return upperdentry; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = oe->lowerdentry; + *is_upper = false; + } + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + struct inode *inode = dentry->d_inode; + + if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + return false; + + res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->lowerdentry); + kfree_rcu(oe, rcu); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry) { + if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } else if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_dput; + ovl_copyattr(realdentry->d_inode, inode); + } + + oe->__upperdentry = upperdentry; + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return ERR_PTR(err); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + dput(ufs->workdir); + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + kfree(ufs); +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, +}; + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_ERR, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + config->upperdir = NULL; + config->lowerdir = NULL; + config->workdir = NULL; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +#define OVL_WORKDIR_NAME "work" + +static struct dentry *ovl_workdir_create(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct dentry *work; + int err; + bool retried = false; + + err = mnt_want_write(mnt); + if (err) + return ERR_PTR(err); + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); +retry: + work = lookup_one_len(OVL_WORKDIR_NAME, dentry, + strlen(OVL_WORKDIR_NAME)); + + if (!IS_ERR(work)) { + struct kstat stat = { + .mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + retried = true; + ovl_cleanup(dir, work); + dput(work); + goto retry; + } + + err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + if (err) + goto out_dput; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(mnt); + + return work; + +out_dput: + dput(work); + work = ERR_PTR(err); + goto out_unlock; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err; + + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + err = -EINVAL; + } + return err; +} + +static bool ovl_is_allowed_fs_type(struct dentry *root) +{ + const struct dentry_operations *dop = root->d_op; + + /* + * We don't support: + * - automount filesystems + * - filesystems with revalidate (FIXME for lower layer) + * - filesystems with case insensitive names + */ + if (dop && + (dop->d_manage || dop->d_automount || + dop->d_revalidate || dop->d_weak_revalidate || + dop->d_compare || dop->d_hash)) { + return false; + } + return true; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct path workpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct ovl_config config; + int err; + + err = ovl_parse_opt((char *) data, &config); + if (err) + goto out; + + /* FIXME: workdir is not needed for a R/O mount */ + err = -EINVAL; + if (!config.upperdir || !config.lowerdir || !config.workdir) { + pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + goto out_free_config; + } + + err = -ENOMEM; + ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out_free_config; + + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_free_ufs; + + err = ovl_mount_dir(config.upperdir, &upperpath); + if (err) + goto out_free_oe; + + err = ovl_mount_dir(config.lowerdir, &lowerpath); + if (err) + goto out_put_upperpath; + + err = ovl_mount_dir(config.workdir, &workpath); + if (err) + goto out_put_lowerpath; + + err = -EINVAL; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || + !S_ISDIR(workpath.dentry->d_inode->i_mode)) { + pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); + goto out_put_workpath; + } + + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(upperpath.dentry)) { + pr_err("overlayfs: filesystem of upperdir is not supported\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { + pr_err("overlayfs: filesystem of lowerdir is not supported\n"); + goto out_put_workpath; + } + + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_workpath; + } + + ufs->lower_mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(ufs->lower_mnt); + if (IS_ERR(ufs->lower_mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_upper_mnt; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + config.workdir, OVL_WORKDIR_NAME); + goto out_put_lower_mnt; + } + + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + ufs->lower_mnt->mnt_flags |= MNT_READONLY; + + /* If the upper fs is r/o, we mark overlayfs r/o too */ + if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + sb->s_flags |= MS_RDONLY; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + root_inode = ovl_new_inode(sb, S_IFDIR, oe); + if (!root_inode) + goto out_put_workdir; + + root_dentry = d_make_root(root_inode); + if (!root_dentry) + goto out_put_workdir; + + mntput(upperpath.mnt); + mntput(lowerpath.mnt); + path_put(&workpath); + + oe->__upperdentry = upperpath.dentry; + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_put_workdir: + dput(ufs->workdir); +out_put_lower_mnt: + mntput(ufs->lower_mnt); +out_put_upper_mnt: + mntput(ufs->upper_mnt); +out_put_workpath: + path_put(&workpath); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_free_ufs: + kfree(ufs); +out_free_config: + kfree(config.lowerdir); + kfree(config.upperdir); + kfree(config.workdir); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlayfs"); + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); -- cgit v1.2.3 From cc2596392af3b1404421aaef828a255303c46f93 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: overlayfs: add statfs support Add support for statfs to the overlayfs filesystem. As the upper layer is the target of all write operations assume that the space in that filesystem is the space in the overlayfs. There will be some inaccuracy as overwriting a file will copy it up and consume space we were not expecting, but it is better than nothing. Use the upper layer dentry and mount from the overlayfs root inode, passing the statfs call to that filesystem. Signed-off-by: Andy Whitcroft Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 227710aad781..aaf562b9f937 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -16,17 +16,21 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); +#define OVERLAYFS_SUPER_MAGIC 0x794c764f + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; struct vfsmount *lower_mnt; struct dentry *workdir; + long lower_namelen; }; struct ovl_dir_cache; @@ -383,8 +387,35 @@ static void ovl_put_super(struct super_block *sb) kfree(ufs); } +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the same filesystem. + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_upper(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, + .statfs = ovl_statfs, }; struct ovl_config { @@ -556,6 +587,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct ovl_entry *oe; struct ovl_fs *ufs; struct ovl_config config; + struct kstatfs statfs; int err; err = ovl_parse_opt((char *) data, &config); @@ -617,6 +649,13 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_workpath; } + err = vfs_statfs(&lowerpath, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on lowerpath\n"); + goto out_put_workpath; + } + ufs->lower_namelen = statfs.f_namelen; + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { @@ -669,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) root_dentry->d_fsdata = oe; + sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_root = root_dentry; sb->s_fs_info = ufs; -- cgit v1.2.3 From f45827e84186af152492c6d0dcf4105b4a605f9b Mon Sep 17 00:00:00 2001 From: Erez Zadok Date: Fri, 24 Oct 2014 00:14:38 +0200 Subject: overlayfs: implement show_options This is useful because of the stacking nature of overlayfs. Users like to find out (via /proc/mounts) which lower/upper directory were used at mount time. AV: even failing ovl_parse_opt() could've done some kstrdup() AV: failure of ovl_alloc_entry() should end up with ENOMEM, not EINVAL Signed-off-by: Erez Zadok Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 76 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index aaf562b9f937..7dcc24e84417 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "overlayfs.h" MODULE_AUTHOR("Miklos Szeredi "); @@ -25,12 +26,20 @@ MODULE_LICENSE("GPL"); #define OVERLAYFS_SUPER_MAGIC 0x794c764f +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; struct vfsmount *lower_mnt; struct dentry *workdir; long lower_namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; }; struct ovl_dir_cache; @@ -384,6 +393,9 @@ static void ovl_put_super(struct super_block *sb) mntput(ufs->upper_mnt); mntput(ufs->lower_mnt); + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); kfree(ufs); } @@ -413,15 +425,27 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + return 0; +} + static const struct super_operations ovl_super_operations = { .put_super = ovl_put_super, .statfs = ovl_statfs, -}; - -struct ovl_config { - char *lowerdir; - char *upperdir; - char *workdir; + .show_options = ovl_show_options, }; enum { @@ -442,10 +466,6 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) { char *p; - config->upperdir = NULL; - config->lowerdir = NULL; - config->workdir = NULL; - while ((p = strsep(&opt, ",")) != NULL) { int token; substring_t args[MAX_OPT_ARGS]; @@ -586,39 +606,40 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_fs *ufs; - struct ovl_config config; struct kstatfs statfs; int err; - err = ovl_parse_opt((char *) data, &config); - if (err) + err = -ENOMEM; + ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) goto out; + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_config; + /* FIXME: workdir is not needed for a R/O mount */ err = -EINVAL; - if (!config.upperdir || !config.lowerdir || !config.workdir) { + if (!ufs->config.upperdir || !ufs->config.lowerdir || + !ufs->config.workdir) { pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); goto out_free_config; } err = -ENOMEM; - ufs = kmalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) - goto out_free_config; - oe = ovl_alloc_entry(); if (oe == NULL) - goto out_free_ufs; + goto out_free_config; - err = ovl_mount_dir(config.upperdir, &upperpath); + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); if (err) goto out_free_oe; - err = ovl_mount_dir(config.lowerdir, &lowerpath); + err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); if (err) goto out_put_upperpath; - err = ovl_mount_dir(config.workdir, &workpath); + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_lowerpath; @@ -674,7 +695,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) err = PTR_ERR(ufs->workdir); if (IS_ERR(ufs->workdir)) { pr_err("overlayfs: failed to create directory %s/%s\n", - config.workdir, OVL_WORKDIR_NAME); + ufs->config.workdir, OVL_WORKDIR_NAME); goto out_put_lower_mnt; } @@ -729,12 +750,11 @@ out_put_upperpath: path_put(&upperpath); out_free_oe: kfree(oe); -out_free_ufs: - kfree(ufs); out_free_config: - kfree(config.lowerdir); - kfree(config.upperdir); - kfree(config.workdir); + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); out: return err; } -- cgit v1.2.3 From 7c37fbda85ceb9be7bdb9d5f53e702efc40cf783 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Fri, 24 Oct 2014 00:14:39 +0200 Subject: overlay: overlay filesystem documentation Document the overlay filesystem. Signed-off-by: Miklos Szeredi --- Documentation/filesystems/overlayfs.txt | 198 ++++++++++++++++++++++++++++++++ MAINTAINERS | 7 ++ 2 files changed, 205 insertions(+) create mode 100644 Documentation/filesystems/overlayfs.txt diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt new file mode 100644 index 000000000000..530850a72735 --- /dev/null +++ b/Documentation/filesystems/overlayfs.txt @@ -0,0 +1,198 @@ +Written by: Neil Brown + +Overlay Filesystem +================== + +This document describes a prototype for a new approach to providing +overlay-filesystem functionality in Linux (sometimes referred to as +union-filesystems). An overlay-filesystem tries to present a +filesystem which is the result over overlaying one filesystem on top +of the other. + +The result will inevitably fail to look exactly like a normal +filesystem for various technical reasons. The expectation is that +many use cases will be able to ignore these differences. + +This approach is 'hybrid' because the objects that appear in the +filesystem do not all appear to belong to that filesystem. In many +cases an object accessed in the union will be indistinguishable +from accessing the corresponding object from the original filesystem. +This is most obvious from the 'st_dev' field returned by stat(2). + +While directories will report an st_dev from the overlay-filesystem, +all non-directory objects will report an st_dev from the lower or +upper filesystem that is providing the object. Similarly st_ino will +only be unique when combined with st_dev, and both of these can change +over the lifetime of a non-directory object. Many applications and +tools ignore these values and will not be affected. + +Upper and Lower +--------------- + +An overlay filesystem combines two filesystems - an 'upper' filesystem +and a 'lower' filesystem. When a name exists in both filesystems, the +object in the 'upper' filesystem is visible while the object in the +'lower' filesystem is either hidden or, in the case of directories, +merged with the 'upper' object. + +It would be more correct to refer to an upper and lower 'directory +tree' rather than 'filesystem' as it is quite possible for both +directory trees to be in the same filesystem and there is no +requirement that the root of a filesystem be given for either upper or +lower. + +The lower filesystem can be any filesystem supported by Linux and does +not need to be writable. The lower filesystem can even be another +overlayfs. The upper filesystem will normally be writable and if it +is it must support the creation of trusted.* extended attributes, and +must provide valid d_type in readdir responses, so NFS is not suitable. + +A read-only overlay of two read-only filesystems may use any +filesystem type. + +Directories +----------- + +Overlaying mainly involves directories. If a given name appears in both +upper and lower filesystems and refers to a non-directory in either, +then the lower object is hidden - the name refers only to the upper +object. + +Where both upper and lower objects are directories, a merged directory +is formed. + +At mount time, the two directories given as mount options "lowerdir" and +"upperdir" are combined into a merged directory: + + mount -t overlayfs overlayfs -olowerdir=/lower,upperdir=/upper,\ +workdir=/work /merged + +The "workdir" needs to be an empty directory on the same filesystem +as upperdir. + +Then whenever a lookup is requested in such a merged directory, the +lookup is performed in each actual directory and the combined result +is cached in the dentry belonging to the overlay filesystem. If both +actual lookups find directories, both are stored and a merged +directory is created, otherwise only one is stored: the upper if it +exists, else the lower. + +Only the lists of names from directories are merged. Other content +such as metadata and extended attributes are reported for the upper +directory only. These attributes of the lower directory are hidden. + +whiteouts and opaque directories +-------------------------------- + +In order to support rm and rmdir without changing the lower +filesystem, an overlay filesystem needs to record in the upper filesystem +that files have been removed. This is done using whiteouts and opaque +directories (non-directories are always opaque). + +A whiteout is created as a character device with 0/0 device number. +When a whiteout is found in the upper level of a merged directory, any +matching name in the lower level is ignored, and the whiteout itself +is also hidden. + +A directory is made opaque by setting the xattr "trusted.overlay.opaque" +to "y". Where the upper filesystem contains an opaque directory, any +directory in the lower filesystem with the same name is ignored. + +readdir +------- + +When a 'readdir' request is made on a merged directory, the upper and +lower directories are each read and the name lists merged in the +obvious way (upper is read first, then lower - entries that already +exist are not re-added). This merged name list is cached in the +'struct file' and so remains as long as the file is kept open. If the +directory is opened and read by two processes at the same time, they +will each have separate caches. A seekdir to the start of the +directory (offset 0) followed by a readdir will cause the cache to be +discarded and rebuilt. + +This means that changes to the merged directory do not appear while a +directory is being read. This is unlikely to be noticed by many +programs. + +seek offsets are assigned sequentially when the directories are read. +Thus if + - read part of a directory + - remember an offset, and close the directory + - re-open the directory some time later + - seek to the remembered offset + +there may be little correlation between the old and new locations in +the list of filenames, particularly if anything has changed in the +directory. + +Readdir on directories that are not merged is simply handled by the +underlying directory (upper or lower). + + +Non-directories +--------------- + +Objects that are not directories (files, symlinks, device-special +files etc.) are presented either from the upper or lower filesystem as +appropriate. When a file in the lower filesystem is accessed in a way +the requires write-access, such as opening for write access, changing +some metadata etc., the file is first copied from the lower filesystem +to the upper filesystem (copy_up). Note that creating a hard-link +also requires copy_up, though of course creation of a symlink does +not. + +The copy_up may turn out to be unnecessary, for example if the file is +opened for read-write but the data is not modified. + +The copy_up process first makes sure that the containing directory +exists in the upper filesystem - creating it and any parents as +necessary. It then creates the object with the same metadata (owner, +mode, mtime, symlink-target etc.) and then if the object is a file, the +data is copied from the lower to the upper filesystem. Finally any +extended attributes are copied up. + +Once the copy_up is complete, the overlay filesystem simply +provides direct access to the newly created file in the upper +filesystem - future operations on the file are barely noticed by the +overlay filesystem (though an operation on the name of the file such as +rename or unlink will of course be noticed and handled). + + +Non-standard behavior +--------------------- + +The copy_up operation essentially creates a new, identical file and +moves it over to the old name. The new file may be on a different +filesystem, so both st_dev and st_ino of the file may change. + +Any open files referring to this inode will access the old data and +metadata. Similarly any file locks obtained before copy_up will not +apply to the copied up file. + +On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and +fsetxattr(2) will fail with EROFS. + +If a file with multiple hard links is copied up, then this will +"break" the link. Changes will not be propagated to other names +referring to the same inode. + +Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory +object in overlayfs will not contain valid absolute paths, only +relative paths leading up to the filesystem's root. This will be +fixed in the future. + +Some operations are not atomic, for example a crash during copy_up or +rename will leave the filesystem in an inconsistent state. This will +be addressed in the future. + +Changes to underlying filesystems +--------------------------------- + +Offline changes, when the overlay is not mounted, are allowed to either +the upper or the lower trees. + +Changes to the underlying filesystems while part of a mounted overlay +filesystem are not allowed. If the underlying filesystem is changed, +the behavior of the overlay is undefined, though it will not result in +a crash or deadlock. diff --git a/MAINTAINERS b/MAINTAINERS index a20df9bf8ab0..aa974d445bfd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6832,6 +6832,13 @@ F: drivers/scsi/osd/ F: include/scsi/osd_* F: fs/exofs/ +OVERLAYFS FILESYSTEM +M: Miklos Szeredi +L: linux-fsdevel@vger.kernel.org +S: Supported +F: fs/overlayfs/* +F: Documentation/filesystems/overlayfs.txt + P54 WIRELESS DRIVER M: Christian Lamparter L: linux-wireless@vger.kernel.org -- cgit v1.2.3 From 69c433ed2ecd2d3264efd7afec4439524b319121 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 24 Oct 2014 00:14:39 +0200 Subject: fs: limit filesystem stacking depth Add a simple read-only counter to super_block that indicates how deep this is in the stack of filesystems. Previously ecryptfs was the only stackable filesystem and it explicitly disallowed multiple layers of itself. Overlayfs, however, can be stacked recursively and also may be stacked on top of ecryptfs or vice versa. To limit the kernel stack usage we must limit the depth of the filesystem stack. Initially the limit is set to 2. Signed-off-by: Miklos Szeredi --- fs/ecryptfs/main.c | 7 +++++++ fs/overlayfs/super.c | 9 +++++++++ include/linux/fs.h | 11 +++++++++++ 3 files changed, 27 insertions(+) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 7dcc24e84417..08b704cebfc4 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -677,6 +677,15 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) } ufs->lower_namelen = statfs.f_namelen; + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + ufs->upper_mnt = clone_private_mount(&upperpath); err = PTR_ERR(ufs->upper_mnt); if (IS_ERR(ufs->upper_mnt)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 69118b3cb917..4e41a4a331bb 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -261,6 +261,12 @@ struct iattr { */ #include +/* + * Maximum number of layers of fs stack. Needs to be limited to + * prevent kernel stack overflow + */ +#define FILESYSTEM_MAX_STACK_DEPTH 2 + /** * enum positive_aop_returns - aop return codes with specific semantics * @@ -1273,6 +1279,11 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + + /* + * Indicates how deep in a filesystem stack this SB is + */ + int s_stack_depth; }; extern struct timespec current_fs_time(struct super_block *sb); -- cgit v1.2.3 From 6fa88d9e0886ad646ef5706539fc314cbaccd690 Mon Sep 17 00:00:00 2001 From: Stefan Hengelein Date: Sun, 19 Oct 2014 20:04:26 +0200 Subject: MIPS: MSP71xx: Fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_MIPS_MT_SMP is enabled, the following compilation error occurs: arch/mips/pmcs-msp71xx/msp_irq_cic.c:134: error: ‘irq’ undeclared This code clearly never saw a compiler. The surrounding code suggests, that 'd->irq' was intended, not 'irq'. This error was found with vampyr. Signed-off-by: Stefan Hengelein Fixes: d7881fbdf866d7d0 ("MIPS: msp71xx: Convert to new irq_chip functions") Acked-by: Geert Uytterhoeven Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/8116/ Signed-off-by: Ralf Baechle --- arch/mips/pmcs-msp71xx/msp_irq_cic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/pmcs-msp71xx/msp_irq_cic.c b/arch/mips/pmcs-msp71xx/msp_irq_cic.c index b8df2f7b3328..1207ec4dfb77 100644 --- a/arch/mips/pmcs-msp71xx/msp_irq_cic.c +++ b/arch/mips/pmcs-msp71xx/msp_irq_cic.c @@ -131,11 +131,11 @@ static int msp_cic_irq_set_affinity(struct irq_data *d, int cpu; unsigned long flags; unsigned int mtflags; - unsigned long imask = (1 << (irq - MSP_CIC_INTBASE)); + unsigned long imask = (1 << (d->irq - MSP_CIC_INTBASE)); volatile u32 *cic_mask = (volatile u32 *)CIC_VPE0_MSK_REG; /* timer balancing should be disabled in kernel code */ - BUG_ON(irq == MSP_INT_VPE0_TIMER || irq == MSP_INT_VPE1_TIMER); + BUG_ON(d->irq == MSP_INT_VPE0_TIMER || d->irq == MSP_INT_VPE1_TIMER); LOCK_CORE(flags, mtflags); /* enable if any of each VPE's TCs require this IRQ */ -- cgit v1.2.3 From aedd153f5bb5b1f1d6d9142014f521ae2ec294cc Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Mon, 20 Oct 2014 09:39:31 +0100 Subject: MIPS: ftrace: Fix a microMIPS build problem Code before the .fixup section needs to have the .insn directive. This has no side effects on MIPS32/64 but it affects the way microMIPS loads the address for the return label. Fixes the following build problem: mips-linux-gnu-ld: arch/mips/built-in.o: .fixup+0x4a0: Unsupported jump between ISA modes; consider recompiling with interlinking enabled. mips-linux-gnu-ld: final link failed: Bad value Makefile:819: recipe for target 'vmlinux' failed The fix is similar to 1658f914ff91c3bf ("MIPS: microMIPS: Disable LL/SC and fix linker bug.") Signed-off-by: Markos Chandras Cc: stable@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/8117/ Signed-off-by: Ralf Baechle --- arch/mips/include/asm/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h index 992aaba603b5..b463f2aa5a61 100644 --- a/arch/mips/include/asm/ftrace.h +++ b/arch/mips/include/asm/ftrace.h @@ -24,7 +24,7 @@ do { \ asm volatile ( \ "1: " load " %[tmp_dst], 0(%[tmp_src])\n" \ " li %[tmp_err], 0\n" \ - "2:\n" \ + "2: .insn\n" \ \ ".section .fixup, \"ax\"\n" \ "3: li %[tmp_err], 1\n" \ @@ -46,7 +46,7 @@ do { \ asm volatile ( \ "1: " store " %[tmp_src], 0(%[tmp_dst])\n"\ " li %[tmp_err], 0\n" \ - "2:\n" \ + "2: .insn\n" \ \ ".section .fixup, \"ax\"\n" \ "3: li %[tmp_err], 1\n" \ -- cgit v1.2.3 From 51486b900ee92856b977eacfc5bfbe6565028070 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 13:26:21 -0400 Subject: fix inode leaks on d_splice_alias() failure exits d_splice_alias() callers expect it to either stash the inode reference into a new alias, or drop the inode reference. That makes it possible to just return d_splice_alias() result from ->lookup() instance, without any extra housekeeping required. Unfortunately, that should include the failure exits. If d_splice_alias() returns an error, it leaves the dentry it has been given negative and thus it *must* drop the inode reference. Easily fixed, but it goes way back and will need backporting. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index d5a23fd0da90..3ffef7f4e5cd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2673,11 +2673,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (!IS_ROOT(new)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } if (d_ancestor(new, dentry)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } write_seqlock(&rename_lock); -- cgit v1.2.3 From fb54a645b2739fb196446ffbbbe3f3589d117b55 Mon Sep 17 00:00:00 2001 From: David Henningsson Date: Fri, 24 Oct 2014 10:00:38 +0200 Subject: ALSA: hda - Add missing terminating entry to SND_HDA_PIN_QUIRK macro Without this terminating entry, the pin matching would continue across random memory until a zero or a non-matching entry was found. The result being that in some cases, the pin quirk would not be applied correctly. Cc: stable@vger.kernel.org Signed-off-by: David Henningsson Signed-off-by: Takashi Iwai --- sound/pci/hda/hda_local.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/hda_local.h b/sound/pci/hda/hda_local.h index 7eb44e78e141..62658f2f8c9f 100644 --- a/sound/pci/hda/hda_local.h +++ b/sound/pci/hda/hda_local.h @@ -419,7 +419,7 @@ struct snd_hda_pin_quirk { .subvendor = _subvendor,\ .name = _name,\ .value = _value,\ - .pins = (const struct hda_pintbl[]) { _pins } \ + .pins = (const struct hda_pintbl[]) { _pins, {0, 0}} \ } #else @@ -427,7 +427,7 @@ struct snd_hda_pin_quirk { { .codec = _codec,\ .subvendor = _subvendor,\ .value = _value,\ - .pins = (const struct hda_pintbl[]) { _pins } \ + .pins = (const struct hda_pintbl[]) { _pins, {0, 0}} \ } #endif -- cgit v1.2.3 From 854e8bb1aa06c578c2c9145fa6bfe3680ef63b23 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 16 Sep 2014 03:24:05 +0300 Subject: KVM: x86: Check non-canonical addresses upon WRMSR Upon WRMSR, the CPU should inject #GP if a non-canonical value (address) is written to certain MSRs. The behavior is "almost" identical for AMD and Intel (ignoring MSRs that are not implemented in either architecture since they would anyhow #GP). However, IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if non-canonical address is written on Intel but not on AMD (which ignores the top 32-bits). Accordingly, this patch injects a #GP on the MSRs which behave identically on Intel and AMD. To eliminate the differences between the architecutres, the value which is written to IA32_SYSENTER_ESP and IA32_SYSENTER_EIP is turned to canonical value before writing instead of injecting a #GP. Some references from Intel and AMD manuals: According to Intel SDM description of WRMSR instruction #GP is expected on WRMSR "If the source register contains a non-canonical address and ECX specifies one of the following MSRs: IA32_DS_AREA, IA32_FS_BASE, IA32_GS_BASE, IA32_KERNEL_GS_BASE, IA32_LSTAR, IA32_SYSENTER_EIP, IA32_SYSENTER_ESP." According to AMD manual instruction manual: LSTAR/CSTAR (SYSCALL): "The WRMSR instruction loads the target RIP into the LSTAR and CSTAR registers. If an RIP written by WRMSR is not in canonical form, a general-protection exception (#GP) occurs." IA32_GS_BASE and IA32_FS_BASE (WRFSBASE/WRGSBASE): "The address written to the base field must be in canonical form or a #GP fault will occur." IA32_KERNEL_GS_BASE (SWAPGS): "The address stored in the KernelGSbase MSR must be in canonical form." This patch fixes CVE-2014-3610. Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 14 ++++++++++++++ arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 27 ++++++++++++++++++++++++++- 4 files changed, 42 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7d603a71ab3a..ccc94de4ac49 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -989,6 +989,20 @@ static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } +static inline u64 get_canonical(u64 la) +{ + return ((int64_t)la << 16) >> 16; +} + +static inline bool is_noncanonical_address(u64 la) +{ +#ifdef CONFIG_X86_64 + return get_canonical(la) != la; +#else + return false; +#endif +} + #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 65510f624dfe..00bed2c5e948 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3251,7 +3251,7 @@ static int wrmsr_interception(struct vcpu_svm *svm) msr.host_initiated = false; svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, &msr)) { + if (kvm_set_msr(&svm->vcpu, &msr)) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(&svm->vcpu, 0); } else { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0acac81f198b..148020a7dd98 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5291,7 +5291,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu) msr.data = data; msr.index = ecx; msr.host_initiated = false; - if (vmx_set_msr(vcpu, &msr) != 0) { + if (kvm_set_msr(vcpu, &msr) != 0) { trace_kvm_msr_write_ex(ecx, data); kvm_inject_gp(vcpu, 0); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c8f94331f8..5a7195573a32 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -987,7 +987,6 @@ void kvm_enable_efer_bits(u64 mask) } EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); - /* * Writes msr value into into the appropriate "register". * Returns 0 on success, non-0 otherwise. @@ -995,8 +994,34 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); */ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { + switch (msr->index) { + case MSR_FS_BASE: + case MSR_GS_BASE: + case MSR_KERNEL_GS_BASE: + case MSR_CSTAR: + case MSR_LSTAR: + if (is_noncanonical_address(msr->data)) + return 1; + break; + case MSR_IA32_SYSENTER_EIP: + case MSR_IA32_SYSENTER_ESP: + /* + * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if + * non-canonical address is written on Intel but not on + * AMD (which ignores the top 32-bits, because it does + * not implement 64-bit SYSENTER). + * + * 64-bit code should hence be able to write a non-canonical + * value on AMD. Making the address canonical ensures that + * vmentry does not fail on Intel after writing a non-canonical + * value, and that something deterministic happens if the guest + * invokes 64-bit SYSENTER. + */ + msr->data = get_canonical(msr->data); + } return kvm_x86_ops->set_msr(vcpu, msr); } +EXPORT_SYMBOL_GPL(kvm_set_msr); /* * Adapt set_msr() to msr_io()'s calling convention -- cgit v1.2.3 From 8b3c3104c3f4f706e99365c3e0d2aa61b95f969f Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 27 Aug 2014 11:16:44 -0700 Subject: KVM: x86: Prevent host from panicking on shared MSR writes. The previous patch blocked invalid writes directly when the MSR is written. As a precaution, prevent future similar mistakes by gracefulling handle GPs caused by writes to shared MSRs. Cc: stable@vger.kernel.org Signed-off-by: Andrew Honig [Remove parts obsoleted by Nadav's patch. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx.c | 7 +++++-- arch/x86/kvm/x86.c | 11 ++++++++--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ccc94de4ac49..6ed0c30d6a0c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1064,7 +1064,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, unsigned long address); void kvm_define_shared_msr(unsigned index, u32 msr); -void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 148020a7dd98..7e2c098b59c9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2659,12 +2659,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) default: msr = find_msr_entry(vmx, msr_index); if (msr) { + u64 old_msr_data = msr->data; msr->data = data; if (msr - vmx->guest_msrs < vmx->save_nmsrs) { preempt_disable(); - kvm_set_shared_msr(msr->index, msr->data, - msr->mask); + ret = kvm_set_shared_msr(msr->index, msr->data, + msr->mask); preempt_enable(); + if (ret) + msr->data = old_msr_data; } break; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a7195573a32..0033df32a745 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -229,20 +229,25 @@ static void kvm_shared_msr_cpu_online(void) shared_msr_update(i, shared_msrs_global.msrs[i]); } -void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) { unsigned int cpu = smp_processor_id(); struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu); + int err; if (((value ^ smsr->values[slot].curr) & mask) == 0) - return; + return 0; smsr->values[slot].curr = value; - wrmsrl(shared_msrs_global.msrs[slot], value); + err = wrmsrl_safe(shared_msrs_global.msrs[slot], value); + if (err) + return 1; + if (!smsr->registered) { smsr->urn.on_user_return = kvm_on_user_return; user_return_notifier_register(&smsr->urn); smsr->registered = true; } + return 0; } EXPORT_SYMBOL_GPL(kvm_set_shared_msr); -- cgit v1.2.3 From 2febc839133280d5a5e8e1179c94ea674489dae2 Mon Sep 17 00:00:00 2001 From: Andy Honig Date: Wed, 27 Aug 2014 14:42:54 -0700 Subject: KVM: x86: Improve thread safety in pit There's a race condition in the PIT emulation code in KVM. In __kvm_migrate_pit_timer the pit_timer object is accessed without synchronization. If the race condition occurs at the wrong time this can crash the host kernel. This fixes CVE-2014-3611. Cc: stable@vger.kernel.org Signed-off-by: Andrew Honig Signed-off-by: Paolo Bonzini --- arch/x86/kvm/i8254.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 518d86471b76..298781d4cfb4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -262,8 +262,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) return; timer = &pit->pit_state.timer; + mutex_lock(&pit->pit_state.lock); if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } static void destroy_pit_timer(struct kvm_pit *pit) -- cgit v1.2.3 From 05c83ec9b73c8124555b706f6af777b10adf0862 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Sep 2014 22:39:37 +0300 Subject: KVM: x86: Fix wrong masking on relative jump/call Relative jumps and calls do the masking according to the operand size, and not according to the address size as the KVM emulator does today. This patch fixes KVM behavior. Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a46207a05835..047698974799 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -504,11 +504,6 @@ static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc) masked_increment(reg_rmw(ctxt, VCPU_REGS_RSP), stack_mask(ctxt), inc); } -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) -{ - register_address_increment(ctxt, &ctxt->_eip, rel); -} - static u32 desc_limit_scaled(struct desc_struct *desc) { u32 limit = get_desc_limit(desc); @@ -569,6 +564,28 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } +static inline void assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + switch (ctxt->op_bytes) { + case 2: + ctxt->_eip = (u16)dst; + break; + case 4: + ctxt->_eip = (u32)dst; + break; + case 8: + ctxt->_eip = dst; + break; + default: + WARN(1, "unsupported eip assignment size\n"); + } +} + +static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +{ + assign_eip_near(ctxt, ctxt->_eip + rel); +} + static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) { u16 selector; -- cgit v1.2.3 From 234f3ce485d54017f15cf5e0699cff4100121601 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Sep 2014 22:39:38 +0300 Subject: KVM: x86: Emulator fixes for eip canonical checks on near branches Before changing rip (during jmp, call, ret, etc.) the target should be asserted to be canonical one, as real CPUs do. During sysret, both target rsp and rip should be canonical. If any of these values is noncanonical, a #GP exception should occur. The exception to this rule are syscall and sysenter instructions in which the assigned rip is checked during the assignment to the relevant MSRs. This patch fixes the emulator to behave as real CPUs do for near branches. Far branches are handled by the next patch. This fixes CVE-2014-3647. Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 78 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 24 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 047698974799..a1b9139169f6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -564,7 +564,8 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, NM_VECTOR, 0, false); } -static inline void assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst, + int cs_l) { switch (ctxt->op_bytes) { case 2: @@ -574,16 +575,25 @@ static inline void assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) ctxt->_eip = (u32)dst; break; case 8: + if ((cs_l && is_noncanonical_address(dst)) || + (!cs_l && (dst & ~(u32)-1))) + return emulate_gp(ctxt, 0); ctxt->_eip = dst; break; default: WARN(1, "unsupported eip assignment size\n"); } + return X86EMUL_CONTINUE; +} + +static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst) +{ + return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64); } -static inline void jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) +static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel) { - assign_eip_near(ctxt, ctxt->_eip + rel); + return assign_eip_near(ctxt, ctxt->_eip + rel); } static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) @@ -1998,13 +2008,15 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) case 2: /* call near abs */ { long int old_eip; old_eip = ctxt->_eip; - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); + if (rc != X86EMUL_CONTINUE) + break; ctxt->src.val = old_eip; rc = em_push(ctxt); break; } case 4: /* jmp abs */ - ctxt->_eip = ctxt->src.val; + rc = assign_eip_near(ctxt, ctxt->src.val); break; case 5: /* jmp far */ rc = em_jmp_far(ctxt); @@ -2039,10 +2051,14 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) static int em_ret(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - return em_pop(ctxt); + int rc; + unsigned long eip; + + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + + return assign_eip_near(ctxt, eip); } static int em_ret_far(struct x86_emulate_ctxt *ctxt) @@ -2323,7 +2339,7 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; struct desc_struct cs, ss; - u64 msr_data; + u64 msr_data, rcx, rdx; int usermode; u16 cs_sel = 0, ss_sel = 0; @@ -2339,6 +2355,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) else usermode = X86EMUL_MODE_PROT32; + rcx = reg_read(ctxt, VCPU_REGS_RCX); + rdx = reg_read(ctxt, VCPU_REGS_RDX); + cs.dpl = 3; ss.dpl = 3; ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); @@ -2356,6 +2375,9 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ss_sel = cs_sel + 8; cs.d = 0; cs.l = 1; + if (is_noncanonical_address(rcx) || + is_noncanonical_address(rdx)) + return emulate_gp(ctxt, 0); break; } cs_sel |= SELECTOR_RPL_MASK; @@ -2364,8 +2386,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt) ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ctxt->_eip = reg_read(ctxt, VCPU_REGS_RDX); - *reg_write(ctxt, VCPU_REGS_RSP) = reg_read(ctxt, VCPU_REGS_RCX); + ctxt->_eip = rdx; + *reg_write(ctxt, VCPU_REGS_RSP) = rcx; return X86EMUL_CONTINUE; } @@ -2905,10 +2927,13 @@ static int em_aad(struct x86_emulate_ctxt *ctxt) static int em_call(struct x86_emulate_ctxt *ctxt) { + int rc; long rel = ctxt->src.val; ctxt->src.val = (unsigned long)ctxt->_eip; - jmp_rel(ctxt, rel); + rc = jmp_rel(ctxt, rel); + if (rc != X86EMUL_CONTINUE) + return rc; return em_push(ctxt); } @@ -2940,11 +2965,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) { int rc; + unsigned long eip; - ctxt->dst.type = OP_REG; - ctxt->dst.addr.reg = &ctxt->_eip; - ctxt->dst.bytes = ctxt->op_bytes; - rc = emulate_pop(ctxt, &ctxt->dst.val, ctxt->op_bytes); + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_near(ctxt, eip); if (rc != X86EMUL_CONTINUE) return rc; rsp_increment(ctxt, ctxt->src.val); @@ -3271,20 +3297,24 @@ static int em_lmsw(struct x86_emulate_ctxt *ctxt) static int em_loop(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1); if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) && (ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags))) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_jcxz(struct x86_emulate_ctxt *ctxt) { + int rc = X86EMUL_CONTINUE; + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); - return X86EMUL_CONTINUE; + return rc; } static int em_in(struct x86_emulate_ctxt *ctxt) @@ -4743,7 +4773,7 @@ special_insn: break; case 0x70 ... 0x7f: /* jcc (short) */ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x8d: /* lea r16/r32, m */ ctxt->dst.val = ctxt->src.addr.mem.ea; @@ -4773,7 +4803,7 @@ special_insn: break; case 0xe9: /* jmp rel */ case 0xeb: /* jmp rel short */ - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); ctxt->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ @@ -4898,7 +4928,7 @@ twobyte_insn: break; case 0x80 ... 0x8f: /* jnz rel, etc*/ if (test_cc(ctxt->b, ctxt->eflags)) - jmp_rel(ctxt, ctxt->src.val); + rc = jmp_rel(ctxt, ctxt->src.val); break; case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); -- cgit v1.2.3 From d1442d85cc30ea75f7d399474ca738e0bc96f715 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 18 Sep 2014 22:39:39 +0300 Subject: KVM: x86: Handle errors when RIP is set during far jumps Far jmp/call/ret may fault while loading a new RIP. Currently KVM does not handle this case, and may result in failed vm-entry once the assignment is done. The tricky part of doing so is that loading the new CS affects the VMCS/VMCB state, so if we fail during loading the new RIP, we are left in unconsistent state. Therefore, this patch saves on 64-bit the old CS descriptor and restores it if loading RIP failed. This fixes CVE-2014-3647. Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 118 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 88 insertions(+), 30 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a1b9139169f6..c0deaff8d9f0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1443,7 +1443,9 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, /* Does not support long mode */ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, - u16 selector, int seg, u8 cpl, bool in_task_switch) + u16 selector, int seg, u8 cpl, + bool in_task_switch, + struct desc_struct *desc) { struct desc_struct seg_desc, old_desc; u8 dpl, rpl; @@ -1584,6 +1586,8 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, } load: ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); + if (desc) + *desc = seg_desc; return X86EMUL_CONTINUE; exception: return emulate_exception(ctxt, err_vec, err_code, true); @@ -1593,7 +1597,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, u16 selector, int seg) { u8 cpl = ctxt->ops->cpl(ctxt); - return __load_segment_descriptor(ctxt, selector, seg, cpl, false); + return __load_segment_descriptor(ctxt, selector, seg, cpl, false, NULL); } static void write_register_operand(struct operand *op) @@ -1987,17 +1991,31 @@ static int em_iret(struct x86_emulate_ctxt *ctxt) static int em_jmp_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned short sel; + unsigned short sel, old_sel; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + u8 cpl = ctxt->ops->cpl(ctxt); + + /* Assignment of RIP may only fail in 64-bit mode */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_sel, &old_desc, NULL, + VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - rc = load_segment_descriptor(ctxt, sel, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); if (rc != X86EMUL_CONTINUE) return rc; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); - return X86EMUL_CONTINUE; + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + /* assigning eip failed; restore the old cs */ + ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS); + return rc; + } + return rc; } static int em_grp45(struct x86_emulate_ctxt *ctxt) @@ -2064,21 +2082,34 @@ static int em_ret(struct x86_emulate_ctxt *ctxt) static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; - unsigned long cs; + unsigned long eip, cs; + u16 old_cs; int cpl = ctxt->ops->cpl(ctxt); + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + + if (ctxt->mode == X86EMUL_MODE_PROT64) + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, + VCPU_SREG_CS); - rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); + rc = emulate_pop(ctxt, &eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - if (ctxt->op_bytes == 4) - ctxt->_eip = (u32)ctxt->_eip; rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; /* Outer-privilege level return is not implemented */ if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) return X86EMUL_UNHANDLEABLE; - rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); + rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = assign_eip_far(ctxt, eip, new_desc.l); + if (rc != X86EMUL_CONTINUE) { + WARN_ON(!ctxt->mode != X86EMUL_MODE_PROT64); + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + } return rc; } @@ -2505,19 +2536,24 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happens at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt, VCPU_SREG_LDTR, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2642,25 +2678,32 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ - ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ldt_selector, VCPU_SREG_LDTR, + cpl, true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->es, VCPU_SREG_ES, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->cs, VCPU_SREG_CS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ss, VCPU_SREG_SS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->ds, VCPU_SREG_DS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->fs, VCPU_SREG_FS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; - ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, true); + ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl, + true, NULL); if (ret != X86EMUL_CONTINUE) return ret; @@ -2942,24 +2985,39 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) u16 sel, old_cs; ulong old_eip; int rc; + struct desc_struct old_desc, new_desc; + const struct x86_emulate_ops *ops = ctxt->ops; + int cpl = ctxt->ops->cpl(ctxt); - old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = ctxt->_eip; + ops->get_segment(ctxt, &old_cs, &old_desc, NULL, VCPU_SREG_CS); memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2); - if (load_segment_descriptor(ctxt, sel, VCPU_SREG_CS)) + rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl, false, + &new_desc); + if (rc != X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - ctxt->_eip = 0; - memcpy(&ctxt->_eip, ctxt->src.valptr, ctxt->op_bytes); + rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l); + if (rc != X86EMUL_CONTINUE) + goto fail; ctxt->src.val = old_cs; rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) - return rc; + goto fail; ctxt->src.val = old_eip; - return em_push(ctxt); + rc = em_push(ctxt); + /* If we failed, we tainted the memory, but the very least we should + restore cs */ + if (rc != X86EMUL_CONTINUE) + goto fail; + return rc; +fail: + ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS); + return rc; + } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) -- cgit v1.2.3 From a642fc305053cc1c6e47e4f4df327895747ab485 Mon Sep 17 00:00:00 2001 From: Petr Matousek Date: Tue, 23 Sep 2014 20:22:30 +0200 Subject: kvm: vmx: handle invvpid vm exit gracefully On systems with invvpid instruction support (corresponding bit in IA32_VMX_EPT_VPID_CAP MSR is set) guest invocation of invvpid causes vm exit, which is currently not handled and results in propagation of unknown exit to userspace. Fix this by installing an invvpid vm exit handler. This is CVE-2014-3646. Cc: stable@vger.kernel.org Signed-off-by: Petr Matousek Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 2 ++ arch/x86/kvm/vmx.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 0e79420376eb..990a2fe1588d 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -67,6 +67,7 @@ #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 #define EXIT_REASON_PREEMPTION_TIMER 52 +#define EXIT_REASON_INVVPID 53 #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 @@ -114,6 +115,7 @@ { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ { EXIT_REASON_INVD, "INVD" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_INVPCID, "INVPCID" } #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7e2c098b59c9..cf3cd079ec52 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6746,6 +6746,12 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; } +static int handle_invvpid(struct kvm_vcpu *vcpu) +{ + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -6791,6 +6797,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_MWAIT_INSTRUCTION] = handle_mwait, [EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor, [EXIT_REASON_INVEPT] = handle_invept, + [EXIT_REASON_INVVPID] = handle_invvpid, }; static const int kvm_vmx_max_exit_handlers = @@ -7026,7 +7033,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD: case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE: case EXIT_REASON_VMOFF: case EXIT_REASON_VMON: - case EXIT_REASON_INVEPT: + case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID: /* * VMX instructions trap unconditionally. This allows L1 to * emulate them for its L2 guest, i.e., allows 3-level nesting! -- cgit v1.2.3 From 2bc19dc3754fc066c43799659f0d848631c44cfe Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 18 Sep 2014 16:21:16 +0300 Subject: kvm: x86: don't kill guest on unknown exit reason KVM_EXIT_UNKNOWN is a kvm bug, we don't really know whether it was triggered by a priveledged application. Let's not kill the guest: WARN and inject #UD instead. Cc: stable@vger.kernel.org Signed-off-by: Michael S. Tsirkin Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 00bed2c5e948..7527cefc5a43 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3551,9 +3551,9 @@ static int handle_exit(struct kvm_vcpu *vcpu) if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_code; - return 0; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } return svm_exit_handlers[exit_code](svm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index cf3cd079ec52..a8b76c4c95e2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7174,10 +7174,10 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = exit_reason; + WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_reason); + kvm_queue_exception(vcpu, UD_VECTOR); + return 1; } - return 0; } static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) -- cgit v1.2.3 From 08da44aedba0f493e10695fa334348a7a4f72eb3 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 3 Oct 2014 01:10:04 +0300 Subject: KVM: x86: Decoding guest instructions which cross page boundary may fail Once an instruction crosses a page boundary, the size read from the second page disregards the common case that part of the operand resides on the first page. As a result, fetch of long insturctions may fail, and thereby cause the decoding to fail as well. Cc: stable@vger.kernel.org Fixes: 5cfc7e0f5e5e1adf998df94f8e36edaf5d30d38e Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c0deaff8d9f0..02c8ea804aaf 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -778,8 +778,10 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) - return __do_insn_fetch_bytes(ctxt, size); + unsigned done_size = ctxt->fetch.end - ctxt->fetch.ptr; + + if (unlikely(done_size < size)) + return __do_insn_fetch_bytes(ctxt, size - done_size); else return X86EMUL_CONTINUE; } -- cgit v1.2.3 From cc08d25a88a396798ebf3ab659fe684add368285 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 24 Oct 2014 13:27:37 +0200 Subject: MIPS: SEAD3: Nuke PIC32 I2C driver. A platform driver for which nothing ever registers the corresponding platform device. Also it was driving the same hardware as sead3-i2c-drv.c so redundant anyway and couldn't co-exist with that driver because each of them was using a private spinlock to protect access to the same hardware resources. This also fixes a randconfig problem: arch/mips/mti-sead3/sead3-pic32-i2c-drv.c: In function 'i2c_platform_probe': arch/mips/mti-sead3/sead3-pic32-i2c-drv.c:345:2: error: implicit declaration of function 'i2c_add_numbered_adapter' [-Werror=implicit-function-declaration] ret = i2c_add_numbered_adapter(&priv->adap); ^ arch/mips/mti-sead3/sead3-pic32-i2c-drv.c: In function 'i2c_platform_remove': arch/mips/mti-sead3/sead3-pic32-i2c-drv.c:361:2: error: implicit declaration of function 'i2c_del_adapter' [-Werror=implicit-function-declaration] i2c_del_adapter(&priv->adap); Signed-off-by: Ralf Baechle --- arch/mips/mti-sead3/Makefile | 1 - arch/mips/mti-sead3/sead3-pic32-bus.c | 102 ------- arch/mips/mti-sead3/sead3-pic32-i2c-drv.c | 423 ------------------------------ 3 files changed, 526 deletions(-) delete mode 100644 arch/mips/mti-sead3/sead3-pic32-bus.c delete mode 100644 arch/mips/mti-sead3/sead3-pic32-i2c-drv.c diff --git a/arch/mips/mti-sead3/Makefile b/arch/mips/mti-sead3/Makefile index febf4334545e..2ae49e99eb67 100644 --- a/arch/mips/mti-sead3/Makefile +++ b/arch/mips/mti-sead3/Makefile @@ -14,7 +14,6 @@ obj-y := sead3-lcd.o sead3-display.o sead3-init.o \ sead3-setup.o sead3-time.o obj-y += sead3-i2c-dev.o sead3-i2c.o \ - sead3-pic32-i2c-drv.o sead3-pic32-bus.o \ leds-sead3.o sead3-leds.o obj-$(CONFIG_EARLY_PRINTK) += sead3-console.o diff --git a/arch/mips/mti-sead3/sead3-pic32-bus.c b/arch/mips/mti-sead3/sead3-pic32-bus.c deleted file mode 100644 index 3b12aa5a7c88..000000000000 --- a/arch/mips/mti-sead3/sead3-pic32-bus.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - */ -#include -#include -#include -#include -#include - -#define PIC32_NULL 0x00 -#define PIC32_RD 0x01 -#define PIC32_SYSRD 0x02 -#define PIC32_WR 0x10 -#define PIC32_SYSWR 0x20 -#define PIC32_IRQ_CLR 0x40 -#define PIC32_STATUS 0x80 - -#define DELAY() udelay(100) /* FIXME: needed? */ - -/* spinlock to ensure atomic access to PIC32 */ -static DEFINE_SPINLOCK(pic32_bus_lock); - -/* FIXME: io_remap these */ -static void __iomem *bus_xfer = (void __iomem *)0xbf000600; -static void __iomem *bus_status = (void __iomem *)0xbf000060; - -static inline unsigned int ioready(void) -{ - return readl(bus_status) & 1; -} - -static inline void wait_ioready(void) -{ - do { } while (!ioready()); -} - -static inline void wait_ioclear(void) -{ - do { } while (ioready()); -} - -static inline void check_ioclear(void) -{ - if (ioready()) { - pr_debug("ioclear: initially busy\n"); - do { - (void) readl(bus_xfer); - DELAY(); - } while (ioready()); - pr_debug("ioclear: cleared busy\n"); - } -} - -u32 pic32_bus_readl(u32 reg) -{ - unsigned long flags; - u32 status, val; - - spin_lock_irqsave(&pic32_bus_lock, flags); - - check_ioclear(); - - writel((PIC32_RD << 24) | (reg & 0x00ffffff), bus_xfer); - DELAY(); - wait_ioready(); - status = readl(bus_xfer); - DELAY(); - val = readl(bus_xfer); - wait_ioclear(); - - pr_debug("pic32_bus_readl: *%x -> %x (status=%x)\n", reg, val, status); - - spin_unlock_irqrestore(&pic32_bus_lock, flags); - - return val; -} - -void pic32_bus_writel(u32 val, u32 reg) -{ - unsigned long flags; - u32 status; - - spin_lock_irqsave(&pic32_bus_lock, flags); - - check_ioclear(); - - writel((PIC32_WR << 24) | (reg & 0x00ffffff), bus_xfer); - DELAY(); - writel(val, bus_xfer); - DELAY(); - wait_ioready(); - status = readl(bus_xfer); - wait_ioclear(); - - pr_debug("pic32_bus_writel: *%x <- %x (status=%x)\n", reg, val, status); - - spin_unlock_irqrestore(&pic32_bus_lock, flags); -} diff --git a/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c b/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c deleted file mode 100644 index 80fe194cfa53..000000000000 --- a/arch/mips/mti-sead3/sead3-pic32-i2c-drv.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PIC32_I2CxCON 0x0000 -#define PIC32_I2CxCONCLR 0x0004 -#define PIC32_I2CxCONSET 0x0008 -#define PIC32_I2CxCONINV 0x000C -#define I2CCON_ON (1<<15) -#define I2CCON_FRZ (1<<14) -#define I2CCON_SIDL (1<<13) -#define I2CCON_SCLREL (1<<12) -#define I2CCON_STRICT (1<<11) -#define I2CCON_A10M (1<<10) -#define I2CCON_DISSLW (1<<9) -#define I2CCON_SMEN (1<<8) -#define I2CCON_GCEN (1<<7) -#define I2CCON_STREN (1<<6) -#define I2CCON_ACKDT (1<<5) -#define I2CCON_ACKEN (1<<4) -#define I2CCON_RCEN (1<<3) -#define I2CCON_PEN (1<<2) -#define I2CCON_RSEN (1<<1) -#define I2CCON_SEN (1<<0) - -#define PIC32_I2CxSTAT 0x0010 -#define PIC32_I2CxSTATCLR 0x0014 -#define PIC32_I2CxSTATSET 0x0018 -#define PIC32_I2CxSTATINV 0x001C -#define I2CSTAT_ACKSTAT (1<<15) -#define I2CSTAT_TRSTAT (1<<14) -#define I2CSTAT_BCL (1<<10) -#define I2CSTAT_GCSTAT (1<<9) -#define I2CSTAT_ADD10 (1<<8) -#define I2CSTAT_IWCOL (1<<7) -#define I2CSTAT_I2COV (1<<6) -#define I2CSTAT_DA (1<<5) -#define I2CSTAT_P (1<<4) -#define I2CSTAT_S (1<<3) -#define I2CSTAT_RW (1<<2) -#define I2CSTAT_RBF (1<<1) -#define I2CSTAT_TBF (1<<0) - -#define PIC32_I2CxADD 0x0020 -#define PIC32_I2CxADDCLR 0x0024 -#define PIC32_I2CxADDSET 0x0028 -#define PIC32_I2CxADDINV 0x002C -#define PIC32_I2CxMSK 0x0030 -#define PIC32_I2CxMSKCLR 0x0034 -#define PIC32_I2CxMSKSET 0x0038 -#define PIC32_I2CxMSKINV 0x003C -#define PIC32_I2CxBRG 0x0040 -#define PIC32_I2CxBRGCLR 0x0044 -#define PIC32_I2CxBRGSET 0x0048 -#define PIC32_I2CxBRGINV 0x004C -#define PIC32_I2CxTRN 0x0050 -#define PIC32_I2CxTRNCLR 0x0054 -#define PIC32_I2CxTRNSET 0x0058 -#define PIC32_I2CxTRNINV 0x005C -#define PIC32_I2CxRCV 0x0060 - -struct i2c_platform_data { - u32 base; - struct i2c_adapter adap; - u32 xfer_timeout; - u32 ack_timeout; - u32 ctl_timeout; -}; - -extern u32 pic32_bus_readl(u32 reg); -extern void pic32_bus_writel(u32 val, u32 reg); - -static inline void -StartI2C(struct i2c_platform_data *adap) -{ - pr_debug("StartI2C\n"); - pic32_bus_writel(I2CCON_SEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -StopI2C(struct i2c_platform_data *adap) -{ - pr_debug("StopI2C\n"); - pic32_bus_writel(I2CCON_PEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -AckI2C(struct i2c_platform_data *adap) -{ - pr_debug("AckI2C\n"); - pic32_bus_writel(I2CCON_ACKDT, adap->base + PIC32_I2CxCONCLR); - pic32_bus_writel(I2CCON_ACKEN, adap->base + PIC32_I2CxCONSET); -} - -static inline void -NotAckI2C(struct i2c_platform_data *adap) -{ - pr_debug("NakI2C\n"); - pic32_bus_writel(I2CCON_ACKDT, adap->base + PIC32_I2CxCONSET); - pic32_bus_writel(I2CCON_ACKEN, adap->base + PIC32_I2CxCONSET); -} - -static inline int -IdleI2C(struct i2c_platform_data *adap) -{ - int i; - - pr_debug("IdleI2C\n"); - for (i = 0; i < adap->ctl_timeout; i++) { - if (((pic32_bus_readl(adap->base + PIC32_I2CxCON) & - (I2CCON_ACKEN | I2CCON_RCEN | I2CCON_PEN | I2CCON_RSEN | - I2CCON_SEN)) == 0) && - ((pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & - (I2CSTAT_TRSTAT)) == 0)) - return 0; - udelay(1); - } - return -ETIMEDOUT; -} - -static inline u32 -MasterWriteI2C(struct i2c_platform_data *adap, u32 byte) -{ - pr_debug("MasterWriteI2C\n"); - - pic32_bus_writel(byte, adap->base + PIC32_I2CxTRN); - - return pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & I2CSTAT_IWCOL; -} - -static inline u32 -MasterReadI2C(struct i2c_platform_data *adap) -{ - pr_debug("MasterReadI2C\n"); - - pic32_bus_writel(I2CCON_RCEN, adap->base + PIC32_I2CxCONSET); - - while (pic32_bus_readl(adap->base + PIC32_I2CxCON) & I2CCON_RCEN) - ; - - pic32_bus_writel(I2CSTAT_I2COV, adap->base + PIC32_I2CxSTATCLR); - - return pic32_bus_readl(adap->base + PIC32_I2CxRCV); -} - -static int -do_address(struct i2c_platform_data *adap, unsigned int addr, int rd) -{ - pr_debug("doaddress\n"); - - IdleI2C(adap); - StartI2C(adap); - IdleI2C(adap); - - addr <<= 1; - if (rd) - addr |= 1; - - if (MasterWriteI2C(adap, addr)) - return -EIO; - IdleI2C(adap); - if (pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & I2CSTAT_ACKSTAT) - return -EIO; - return 0; -} - -static int -i2c_read(struct i2c_platform_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - u32 data; - - pr_debug("i2c_read\n"); - - i = 0; - while (i < len) { - data = MasterReadI2C(adap); - buf[i++] = data; - if (i < len) - AckI2C(adap); - else - NotAckI2C(adap); - } - - StopI2C(adap); - IdleI2C(adap); - return 0; -} - -static int -i2c_write(struct i2c_platform_data *adap, unsigned char *buf, - unsigned int len) -{ - int i; - u32 data; - - pr_debug("i2c_write\n"); - - i = 0; - while (i < len) { - data = buf[i]; - if (MasterWriteI2C(adap, data)) - return -EIO; - IdleI2C(adap); - if (pic32_bus_readl(adap->base + PIC32_I2CxSTAT) & - I2CSTAT_ACKSTAT) - return -EIO; - i++; - } - - StopI2C(adap); - IdleI2C(adap); - return 0; -} - -static int -platform_xfer(struct i2c_adapter *i2c_adap, struct i2c_msg *msgs, int num) -{ - struct i2c_platform_data *adap = i2c_adap->algo_data; - struct i2c_msg *p; - int i, err = 0; - - pr_debug("platform_xfer\n"); - for (i = 0; i < num; i++) { -#define __BUFSIZE 80 - int ii; - static char buf[__BUFSIZE]; - char *b = buf; - - p = &msgs[i]; - b += sprintf(buf, " [%d bytes]", p->len); - if ((p->flags & I2C_M_RD) == 0) { - for (ii = 0; ii < p->len; ii++) { - if (b < &buf[__BUFSIZE-4]) { - b += sprintf(b, " %02x", p->buf[ii]); - } else { - strcat(b, "..."); - break; - } - } - } - pr_debug("xfer%d: DevAddr: %04x Op:%s Data:%s\n", i, p->addr, - (p->flags & I2C_M_RD) ? "Rd" : "Wr", buf); - } - - - for (i = 0; !err && i < num; i++) { - p = &msgs[i]; - err = do_address(adap, p->addr, p->flags & I2C_M_RD); - if (err || !p->len) - continue; - if (p->flags & I2C_M_RD) - err = i2c_read(adap, p->buf, p->len); - else - err = i2c_write(adap, p->buf, p->len); - } - - /* Return the number of messages processed, or the error code. */ - if (err == 0) - err = num; - - return err; -} - -static u32 -platform_func(struct i2c_adapter *adap) -{ - pr_debug("platform_algo\n"); - return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL; -} - -static const struct i2c_algorithm platform_algo = { - .master_xfer = platform_xfer, - .functionality = platform_func, -}; - -static void i2c_platform_setup(struct i2c_platform_data *priv) -{ - pr_debug("i2c_platform_setup\n"); - - pic32_bus_writel(500, priv->base + PIC32_I2CxBRG); - pic32_bus_writel(I2CCON_ON, priv->base + PIC32_I2CxCONCLR); - pic32_bus_writel(I2CCON_ON, priv->base + PIC32_I2CxCONSET); - pic32_bus_writel((I2CSTAT_BCL | I2CSTAT_IWCOL), - (priv->base + PIC32_I2CxSTATCLR)); -} - -static void i2c_platform_disable(struct i2c_platform_data *priv) -{ - pr_debug("i2c_platform_disable\n"); -} - -static int i2c_platform_probe(struct platform_device *pdev) -{ - struct i2c_platform_data *priv; - struct resource *r; - int ret; - - pr_debug("i2c_platform_probe\n"); - r = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!r) - return -ENODEV; - - priv = devm_kzalloc(&pdev->dev, sizeof(struct i2c_platform_data), - GFP_KERNEL); - if (!priv) - return -ENOMEM; - - /* FIXME: need to allocate resource in PIC32 space */ -#if 0 - priv->base = bus_request_region(r->start, resource_size(r), - pdev->name); -#else - priv->base = r->start; -#endif - if (!priv->base) - return -EBUSY; - - priv->xfer_timeout = 200; - priv->ack_timeout = 200; - priv->ctl_timeout = 200; - - priv->adap.nr = pdev->id; - priv->adap.algo = &platform_algo; - priv->adap.algo_data = priv; - priv->adap.dev.parent = &pdev->dev; - strlcpy(priv->adap.name, "PIC32 I2C", sizeof(priv->adap.name)); - - i2c_platform_setup(priv); - - ret = i2c_add_numbered_adapter(&priv->adap); - if (ret) { - i2c_platform_disable(priv); - return ret; - } - - platform_set_drvdata(pdev, priv); - return 0; -} - -static int i2c_platform_remove(struct platform_device *pdev) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - pr_debug("i2c_platform_remove\n"); - platform_set_drvdata(pdev, NULL); - i2c_del_adapter(&priv->adap); - i2c_platform_disable(priv); - return 0; -} - -#ifdef CONFIG_PM -static int -i2c_platform_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - dev_dbg(&pdev->dev, "i2c_platform_disable\n"); - i2c_platform_disable(priv); - - return 0; -} - -static int -i2c_platform_resume(struct platform_device *pdev) -{ - struct i2c_platform_data *priv = platform_get_drvdata(pdev); - - dev_dbg(&pdev->dev, "i2c_platform_setup\n"); - i2c_platform_setup(priv); - - return 0; -} -#else -#define i2c_platform_suspend NULL -#define i2c_platform_resume NULL -#endif - -static struct platform_driver i2c_platform_driver = { - .driver = { - .name = "i2c_pic32", - .owner = THIS_MODULE, - }, - .probe = i2c_platform_probe, - .remove = i2c_platform_remove, - .suspend = i2c_platform_suspend, - .resume = i2c_platform_resume, -}; - -static int __init -i2c_platform_init(void) -{ - pr_debug("i2c_platform_init\n"); - return platform_driver_register(&i2c_platform_driver); -} - -static void __exit -i2c_platform_exit(void) -{ - pr_debug("i2c_platform_exit\n"); - platform_driver_unregister(&i2c_platform_driver); -} - -MODULE_AUTHOR("Chris Dearman, MIPS Technologies INC."); -MODULE_DESCRIPTION("PIC32 I2C driver"); -MODULE_LICENSE("GPL"); - -module_init(i2c_platform_init); -module_exit(i2c_platform_exit); -- cgit v1.2.3 From a430c9166312e1aa3d80bce32374233bdbfeba32 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Oct 2014 14:54:14 +0200 Subject: KVM: emulate: avoid accessing NULL ctxt->memopp A failure to decode the instruction can cause a NULL pointer access. This is fixed simply by moving the "done" label as close as possible to the return. This fixes CVE-2014-8481. Reported-by: Andy Lutomirski Cc: stable@vger.kernel.org Fixes: 41061cdb98a0bec464278b4db8e894a3121671f5 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 02c8ea804aaf..eb3b1c46f995 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4580,10 +4580,10 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); -done: if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; +done: return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } -- cgit v1.2.3 From 13e457e0eebf0a0c82c38ceb890d93eb826d62a6 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 13 Oct 2014 13:04:13 +0300 Subject: KVM: x86: Emulator does not decode clflush well Currently, all group15 instructions are decoded as clflush (e.g., mfence, xsave). In addition, the clflush instruction requires no prefix (66/f2/f3) would exist. If prefix exists it may encode a different instruction (e.g., clflushopt). Creating a group for clflush, and different group for each prefix. This has been the case forever, but the next patch needs the cflush group in order to fix a bug introduced in 3.17. Fixes: 41061cdb98a0bec464278b4db8e894a3121671f5 Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index eb3b1c46f995..97da5034d812 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3462,6 +3462,12 @@ static int em_bswap(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_clflush(struct x86_emulate_ctxt *ctxt) +{ + /* emulating clflush regardless of cpuid */ + return X86EMUL_CONTINUE; +} + static bool valid_cr(int nr) { switch (nr) { @@ -3800,6 +3806,16 @@ static const struct opcode group11[] = { X7(D(Undefined)), }; +static const struct gprefix pfx_0f_ae_7 = { + I(0, em_clflush), N, N, N, +}; + +static const struct group_dual group15 = { { + N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), +}, { + N, N, N, N, N, N, N, N, +} }; + static const struct gprefix pfx_0f_6f_0f_7f = { I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov), }; @@ -4063,7 +4079,7 @@ static const struct opcode twobyte_table[256] = { F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts), F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd), F(DstMem | SrcReg | Src2CL | ModRM, em_shrd), - D(ModRM), F(DstReg | SrcMem | ModRM, em_imul), + GD(0, &group15), F(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ I2bv(DstMem | SrcReg | ModRM | Lock | PageTable, em_cmpxchg), I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), @@ -4993,8 +5009,6 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xae: /* clflush */ - break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; ctxt->dst.val = (ctxt->src.bytes == 1) ? (u8) ctxt->src.val -- cgit v1.2.3 From 3f6f1480d86bf9fc16c160d803ab1d006e3058d5 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 13 Oct 2014 13:04:14 +0300 Subject: KVM: x86: PREFETCH and HINT_NOP should have SrcMem flag The decode phase of the x86 emulator assumes that every instruction with the ModRM flag, and which can be used with RIP-relative addressing, has either SrcMem or DstMem. This is not the case for several instructions - prefetch, hint-nop and clflush. Adding SrcMem|NoAccess for prefetch and hint-nop and SrcMem for clflush. This fixes CVE-2014-8480. Fixes: 41061cdb98a0bec464278b4db8e894a3121671f5 Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 97da5034d812..749f9fa38254 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3807,7 +3807,7 @@ static const struct opcode group11[] = { }; static const struct gprefix pfx_0f_ae_7 = { - I(0, em_clflush), N, N, N, + I(SrcMem | ByteOp, em_clflush), N, N, N, }; static const struct group_dual group15 = { { @@ -4024,10 +4024,11 @@ static const struct opcode twobyte_table[256] = { N, I(ImplicitOps | EmulateOnUD, em_syscall), II(ImplicitOps | Priv, em_clts, clts), N, DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, - N, D(ImplicitOps | ModRM), N, N, + N, D(ImplicitOps | ModRM | SrcMem | NoAccess), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, - D(ImplicitOps | ModRM), N, N, N, N, N, N, D(ImplicitOps | ModRM), + D(ImplicitOps | ModRM | SrcMem | NoAccess), + N, N, N, N, N, N, D(ImplicitOps | ModRM | SrcMem | NoAccess), /* 0x20 - 0x2F */ DIP(ModRM | DstMem | Priv | Op3264 | NoMod, cr_read, check_cr_read), DIP(ModRM | DstMem | Priv | Op3264 | NoMod, dr_read, check_dr_read), -- cgit v1.2.3 From 3d32e4dbe71374a6780eaf51d719d76f9a9bf22f Mon Sep 17 00:00:00 2001 From: Quentin Casasnovas Date: Fri, 17 Oct 2014 22:55:59 +0200 Subject: kvm: fix excessive pages un-pinning in kvm_iommu_map error path. The third parameter of kvm_unpin_pages() when called from kvm_iommu_map_pages() is wrong, it should be the number of pages to un-pin and not the page size. This error was facilitated with an inconsistent API: kvm_pin_pages() takes a size, but kvn_unpin_pages() takes a number of pages, so fix the problem by matching the two. This was introduced by commit 350b8bd ("kvm: iommu: fix the third parameter of kvm_iommu_put_pages (CVE-2014-3601)"), which fixes the lack of un-pinning for pages intended to be un-pinned (i.e. memory leak) but unfortunately potentially aggravated the number of pages we un-pin that should have stayed pinned. As far as I understand though, the same practical mitigations apply. This issue was found during review of Red Hat 6.6 patches to prepare Ksplice rebootless updates. Thanks to Vegard for his time on a late Friday evening to help me in understanding this code. Fixes: 350b8bd ("kvm: iommu: fix the third parameter of... (CVE-2014-3601)") Cc: stable@vger.kernel.org Signed-off-by: Quentin Casasnovas Signed-off-by: Vegard Nossum Signed-off-by: Jamie Iles Reviewed-by: Sasha Levin Signed-off-by: Paolo Bonzini --- virt/kvm/iommu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c index e51d9f9b995f..c1e6ae989a43 100644 --- a/virt/kvm/iommu.c +++ b/virt/kvm/iommu.c @@ -43,13 +43,13 @@ static void kvm_iommu_put_pages(struct kvm *kvm, gfn_t base_gfn, unsigned long npages); static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long size) + unsigned long npages) { gfn_t end_gfn; pfn_t pfn; pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + (size >> PAGE_SHIFT); + end_gfn = gfn + npages; gfn += 1; if (is_error_noslot_pfn(pfn)) @@ -119,7 +119,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) * Pin all pages we are about to map in memory. This is * important because we unmap and unpin in 4kb steps later. */ - pfn = kvm_pin_pages(slot, gfn, page_size); + pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); if (is_error_noslot_pfn(pfn)) { gfn += 1; continue; @@ -131,7 +131,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) if (r) { printk(KERN_ERR "kvm_iommu_map_address:" "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size); + kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); goto unmap_pages; } -- cgit v1.2.3 From 1715d0dcb0d4231983f1580e265c748cd3371f55 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 30 Sep 2014 20:49:18 +0300 Subject: KVM: x86: Wrong assertion on paging_tmpl.h Even after the recent fix, the assertion on paging_tmpl.h is triggered. Apparently, the assertion wants to check that the PAE is always set on long-mode, but does it in incorrect way. Note that the assertion is not enabled unless the code is debugged by defining MMU_DEBUG. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 806d58e3c320..fd49c867b25a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -298,7 +298,7 @@ retry_walk: } #endif walker->max_level = walker->level; - ASSERT(!is_long_mode(vcpu) && is_pae(vcpu)); + ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); accessed_dirty = PT_GUEST_ACCESSED_MASK; pt_access = pte_access = ACC_ALL; -- cgit v1.2.3 From 571ee1b6859869a09ed718d390aac2b9414646a2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 9 Oct 2014 18:30:08 +0800 Subject: kvm: vfio: fix unregister kvm_device_ops of vfio After commit 80ce163 (KVM: VFIO: register kvm_device_ops dynamically), kvm_device_ops of vfio can be registered dynamically. Commit 3c3c29fd (kvm-vfio: do not use module_init) move the dynamic register invoked by kvm_init in order to fix broke unloading of the kvm module. However, kvm_device_ops of vfio is unregistered after rmmod kvm-intel module which lead to device type collision detection warning after kvm-intel module reinsmod. WARNING: CPU: 1 PID: 10358 at /root/cathy/kvm/arch/x86/kvm/../../../virt/kvm/kvm_main.c:3289 kvm_init+0x234/0x282 [kvm]() Modules linked in: kvm_intel(O+) kvm(O) nfsv3 nfs_acl auth_rpcgss oid_registry nfsv4 dns_resolver nfs fscache lockd sunrpc pci_stub bridge stp llc autofs4 8021q cpufreq_ondemand ipv6 joydev microcode pcspkr igb i2c_algo_bit ehci_pci ehci_hcd e1000e i2c_i801 ixgbe ptp pps_core hwmon mdio tpm_tis tpm ipmi_si ipmi_msghandler acpi_cpufreq isci libsas scsi_transport_sas button dm_mirror dm_region_hash dm_log dm_mod [last unloaded: kvm_intel] CPU: 1 PID: 10358 Comm: insmod Tainted: G W O 3.17.0-rc1 #2 Hardware name: Intel Corporation S2600CP/S2600CP, BIOS RMLSDP.86I.00.29.D696.1311111329 11/11/2013 0000000000000cd9 ffff880ff08cfd18 ffffffff814a61d9 0000000000000cd9 0000000000000000 ffff880ff08cfd58 ffffffff810417b7 ffff880ff08cfd48 ffffffffa045bcac ffffffffa049c420 0000000000000040 00000000000000ff Call Trace: [] dump_stack+0x49/0x60 [] warn_slowpath_common+0x7c/0x96 [] ? kvm_init+0x234/0x282 [kvm] [] warn_slowpath_null+0x15/0x17 [] kvm_init+0x234/0x282 [kvm] [] vmx_init+0x1bf/0x42a [kvm_intel] [] ? vmx_check_processor_compat+0x64/0x64 [kvm_intel] [] do_one_initcall+0xe3/0x170 [] ? __vunmap+0xad/0xb8 [] do_init_module+0x2b/0x174 [] load_module+0x43e/0x569 [] ? do_init_module+0x174/0x174 [] ? copy_module_from_user+0x39/0x82 [] ? module_sect_show+0x20/0x20 [] SyS_init_module+0x54/0x81 [] system_call_fastpath+0x16/0x1b ---[ end trace 0626f4a3ddea56f3 ]--- The bug can be reproduced by: rmmod kvm_intel.ko insmod kvm_intel.ko without rmmod/insmod kvm.ko This patch fixes the bug by unregistering kvm_device_ops of vfio when the kvm-intel module is removed. Reported-by: Liu Rongrong Fixes: 3c3c29fd0d7cddc32862c350d0700ce69953e3bd Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 7 +++++++ virt/kvm/vfio.c | 5 +++++ virt/kvm/vfio.h | 4 ++++ 4 files changed, 17 insertions(+) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 28be31f49250..ea53b04993f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1080,6 +1080,7 @@ void kvm_device_get(struct kvm_device *dev); void kvm_device_put(struct kvm_device *dev); struct kvm_device *kvm_device_from_filp(struct file *filp); int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); +void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; extern struct kvm_device_ops kvm_xics_ops; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 384eaa7b02fa..25ffac9e947d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2354,6 +2354,12 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type) return 0; } +void kvm_unregister_device_ops(u32 type) +{ + if (kvm_device_ops_table[type] != NULL) + kvm_device_ops_table[type] = NULL; +} + static int kvm_ioctl_create_device(struct kvm *kvm, struct kvm_create_device *cd) { @@ -3328,5 +3334,6 @@ void kvm_exit(void) kvm_arch_exit(); kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); + kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 281e7cf2b8e5..620e37f741b8 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -283,3 +283,8 @@ int kvm_vfio_ops_init(void) { return kvm_register_device_ops(&kvm_vfio_ops, KVM_DEV_TYPE_VFIO); } + +void kvm_vfio_ops_exit(void) +{ + kvm_unregister_device_ops(KVM_DEV_TYPE_VFIO); +} diff --git a/virt/kvm/vfio.h b/virt/kvm/vfio.h index 92eac75d6b62..ab88c7dc0514 100644 --- a/virt/kvm/vfio.h +++ b/virt/kvm/vfio.h @@ -3,11 +3,15 @@ #ifdef CONFIG_KVM_VFIO int kvm_vfio_ops_init(void); +void kvm_vfio_ops_exit(void); #else static inline int kvm_vfio_ops_init(void) { return 0; } +static inline void kvm_vfio_ops_exit(void) +{ +} #endif #endif -- cgit v1.2.3 From 4846f1181635e1f68aed0725f71700c15c2fb449 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Fri, 24 Oct 2014 12:23:26 +0200 Subject: MIPS: SEAD3: Fix I2C device registration. This isn't a module and shouldn't be one. Signed-off-by: Ralf Baechle --- arch/mips/mti-sead3/sead3-i2c.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/mips/mti-sead3/sead3-i2c.c b/arch/mips/mti-sead3/sead3-i2c.c index f70d5fc58ef5..795ae83894e0 100644 --- a/arch/mips/mti-sead3/sead3-i2c.c +++ b/arch/mips/mti-sead3/sead3-i2c.c @@ -5,10 +5,8 @@ * * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. */ -#include #include #include -#include struct resource sead3_i2c_resources[] = { { @@ -30,8 +28,4 @@ static int __init sead3_i2c_init(void) return platform_device_register(&sead3_i2c_device); } -module_init(sead3_i2c_init); - -MODULE_AUTHOR("Chris Dearman "); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("I2C probe driver for SEAD3"); +device_initcall(sead3_i2c_init); -- cgit v1.2.3 From 92980405f3537136b8e81007a9df576762f49bbb Mon Sep 17 00:00:00 2001 From: Arun Chandran Date: Fri, 10 Oct 2014 12:31:24 +0100 Subject: arm64: ASLR: Don't randomise text when randomise_va_space == 0 When user asks to turn off ASLR by writing "0" to /proc/sys/kernel/randomize_va_space there should not be any randomization to mmap base, stack, VDSO, libs, text and heap Currently arm64 violates this behavior by randomising text. Fix this by defining a constant ELF_ET_DYN_BASE. The randomisation of mm->mmap_base is done by setup_new_exec -> arch_pick_mmap_layout -> mmap_base -> mmap_rnd. Signed-off-by: Arun Chandran Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 1 + arch/arm64/include/asm/elf.h | 4 ++-- arch/arm64/kernel/process.c | 5 ----- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b8053be1d803..9532f8d5857e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1,5 +1,6 @@ config ARM64 def_bool y + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 01d3aab64b79..1f65be393139 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -126,7 +126,7 @@ typedef struct user_fpsimd_state elf_fpregset_t; * that it will "exec", and that there is sufficient room for the brk. */ extern unsigned long randomize_et_dyn(unsigned long base); -#define ELF_ET_DYN_BASE (randomize_et_dyn(2 * TASK_SIZE_64 / 3)) +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) /* * When the program starts, a1 contains a pointer to a function to be @@ -169,7 +169,7 @@ extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define COMPAT_ELF_PLATFORM ("v8l") #endif -#define COMPAT_ELF_ET_DYN_BASE (randomize_et_dyn(2 * TASK_SIZE_32 / 3)) +#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3) /* AArch32 registers. */ #define COMPAT_ELF_NGREG 18 diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index c3065dbc4fa2..fde9923af859 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -378,8 +378,3 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) { return randomize_base(mm->brk); } - -unsigned long randomize_et_dyn(unsigned long base) -{ - return randomize_base(base); -} -- cgit v1.2.3 From ef3e035c3a9b81da8a778bc333d10637acf6c199 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 23 Oct 2014 12:58:13 -0700 Subject: sparc64: Fix register corruption in top-most kernel stack frame during boot. Meelis Roos reported that kernels built with gcc-4.9 do not boot, we eventually narrowed this down to only impacting machines using UltraSPARC-III and derivitive cpus. The crash happens right when the first user process is spawned: [ 54.451346] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 [ 54.451346] [ 54.571516] CPU: 1 PID: 1 Comm: init Not tainted 3.16.0-rc2-00211-gd7933ab #96 [ 54.666431] Call Trace: [ 54.698453] [0000000000762f8c] panic+0xb0/0x224 [ 54.759071] [000000000045cf68] do_exit+0x948/0x960 [ 54.823123] [000000000042cbc0] fault_in_user_windows+0xe0/0x100 [ 54.902036] [0000000000404ad0] __handle_user_windows+0x0/0x10 [ 54.978662] Press Stop-A (L1-A) to return to the boot prom [ 55.050713] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000004 Further investigation showed that compiling only per_cpu_patch() with an older compiler fixes the boot. Detailed analysis showed that the function is not being miscompiled by gcc-4.9, but it is using a different register allocation ordering. With the gcc-4.9 compiled function, something during the code patching causes some of the %i* input registers to get corrupted. Perhaps we have a TLB miss path into the firmware that is deep enough to cause a register window spill and subsequent restore when we get back from the TLB miss trap. Let's plug this up by doing two things: 1) Stop using the firmware stack for client interface calls into the firmware. Just use the kernel's stack. 2) As soon as we can, call into a new function "start_early_boot()" to put a one-register-window buffer between the firmware's deepest stack frame and the top-most initial kernel one. Reported-by: Meelis Roos Tested-by: Meelis Roos Signed-off-by: David S. Miller --- arch/sparc/include/asm/oplib_64.h | 3 ++- arch/sparc/include/asm/setup.h | 2 ++ arch/sparc/kernel/entry.h | 3 --- arch/sparc/kernel/head_64.S | 40 ++++----------------------------------- arch/sparc/kernel/hvtramp.S | 1 - arch/sparc/kernel/setup_64.c | 28 +++++++++++++++++++-------- arch/sparc/kernel/trampoline_64.S | 12 +++++++----- arch/sparc/prom/cif.S | 5 ++--- arch/sparc/prom/init_64.c | 6 +++--- arch/sparc/prom/p1275.c | 2 -- 10 files changed, 40 insertions(+), 62 deletions(-) diff --git a/arch/sparc/include/asm/oplib_64.h b/arch/sparc/include/asm/oplib_64.h index f34682430fcf..2e3a4add8591 100644 --- a/arch/sparc/include/asm/oplib_64.h +++ b/arch/sparc/include/asm/oplib_64.h @@ -62,7 +62,8 @@ struct linux_mem_p1275 { /* You must call prom_init() before using any of the library services, * preferably as early as possible. Pass it the romvec pointer. */ -void prom_init(void *cif_handler, void *cif_stack); +void prom_init(void *cif_handler); +void prom_init_report(void); /* Boot argument acquisition, returns the boot command line string. */ char *prom_getbootargs(void); diff --git a/arch/sparc/include/asm/setup.h b/arch/sparc/include/asm/setup.h index f5fffd84d0dd..29d64b1758ed 100644 --- a/arch/sparc/include/asm/setup.h +++ b/arch/sparc/include/asm/setup.h @@ -48,6 +48,8 @@ unsigned long safe_compute_effective_address(struct pt_regs *, unsigned int); #endif #ifdef CONFIG_SPARC64 +void __init start_early_boot(void); + /* unaligned_64.c */ int handle_ldf_stq(u32 insn, struct pt_regs *regs); void handle_ld_nf(u32 insn, struct pt_regs *regs); diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h index ebaba6167dd4..88d322b67fac 100644 --- a/arch/sparc/kernel/entry.h +++ b/arch/sparc/kernel/entry.h @@ -65,13 +65,10 @@ struct pause_patch_entry { extern struct pause_patch_entry __pause_3insn_patch, __pause_3insn_patch_end; -void __init per_cpu_patch(void); void sun4v_patch_1insn_range(struct sun4v_1insn_patch_entry *, struct sun4v_1insn_patch_entry *); void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *, struct sun4v_2insn_patch_entry *); -void __init sun4v_patch(void); -void __init boot_cpu_id_too_large(int cpu); extern unsigned int dcache_parity_tl1_occurred; extern unsigned int icache_parity_tl1_occurred; diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S index 4fdeb8040d4d..3d61fcae7ee3 100644 --- a/arch/sparc/kernel/head_64.S +++ b/arch/sparc/kernel/head_64.S @@ -672,14 +672,12 @@ tlb_fixup_done: sethi %hi(init_thread_union), %g6 or %g6, %lo(init_thread_union), %g6 ldx [%g6 + TI_TASK], %g4 - mov %sp, %l6 wr %g0, ASI_P, %asi mov 1, %g1 sllx %g1, THREAD_SHIFT, %g1 sub %g1, (STACKFRAME_SZ + STACK_BIAS), %g1 add %g6, %g1, %sp - mov 0, %fp /* Set per-cpu pointer initially to zero, this makes * the boot-cpu use the in-kernel-image per-cpu areas @@ -706,44 +704,14 @@ tlb_fixup_done: nop #endif - mov %l6, %o1 ! OpenPROM stack call prom_init mov %l7, %o0 ! OpenPROM cif handler - /* Initialize current_thread_info()->cpu as early as possible. - * In order to do that accurately we have to patch up the get_cpuid() - * assembler sequences. And that, in turn, requires that we know - * if we are on a Starfire box or not. While we're here, patch up - * the sun4v sequences as well. + /* To create a one-register-window buffer between the kernel's + * initial stack and the last stack frame we use from the firmware, + * do the rest of the boot from a C helper function. */ - call check_if_starfire - nop - call per_cpu_patch - nop - call sun4v_patch - nop - -#ifdef CONFIG_SMP - call hard_smp_processor_id - nop - cmp %o0, NR_CPUS - blu,pt %xcc, 1f - nop - call boot_cpu_id_too_large - nop - /* Not reached... */ - -1: -#else - mov 0, %o0 -#endif - sth %o0, [%g6 + TI_CPU] - - call prom_init_report - nop - - /* Off we go.... */ - call start_kernel + call start_early_boot nop /* Not reached... */ diff --git a/arch/sparc/kernel/hvtramp.S b/arch/sparc/kernel/hvtramp.S index b7ddcdd1dea9..cdbfec299f2f 100644 --- a/arch/sparc/kernel/hvtramp.S +++ b/arch/sparc/kernel/hvtramp.S @@ -109,7 +109,6 @@ hv_cpu_startup: sllx %g5, THREAD_SHIFT, %g5 sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 add %g6, %g5, %sp - mov 0, %fp call init_irqwork_curcpu nop diff --git a/arch/sparc/kernel/setup_64.c b/arch/sparc/kernel/setup_64.c index e629b8377587..c38d19fc27ba 100644 --- a/arch/sparc/kernel/setup_64.c +++ b/arch/sparc/kernel/setup_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -162,7 +163,7 @@ char reboot_command[COMMAND_LINE_SIZE]; static struct pt_regs fake_swapper_regs = { { 0, }, 0, 0, 0, 0 }; -void __init per_cpu_patch(void) +static void __init per_cpu_patch(void) { struct cpuid_patch_entry *p; unsigned long ver; @@ -254,7 +255,7 @@ void sun4v_patch_2insn_range(struct sun4v_2insn_patch_entry *start, } } -void __init sun4v_patch(void) +static void __init sun4v_patch(void) { extern void sun4v_hvapi_init(void); @@ -323,14 +324,25 @@ static void __init pause_patch(void) } } -#ifdef CONFIG_SMP -void __init boot_cpu_id_too_large(int cpu) +void __init start_early_boot(void) { - prom_printf("Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n", - cpu, NR_CPUS); - prom_halt(); + int cpu; + + check_if_starfire(); + per_cpu_patch(); + sun4v_patch(); + + cpu = hard_smp_processor_id(); + if (cpu >= NR_CPUS) { + prom_printf("Serious problem, boot cpu id (%d) >= NR_CPUS (%d)\n", + cpu, NR_CPUS); + prom_halt(); + } + current_thread_info()->cpu = cpu; + + prom_init_report(); + start_kernel(); } -#endif /* On Ultra, we support all of the v8 capabilities. */ unsigned long sparc64_elf_hwcap = (HWCAP_SPARC_FLUSH | HWCAP_SPARC_STBAR | diff --git a/arch/sparc/kernel/trampoline_64.S b/arch/sparc/kernel/trampoline_64.S index 737f8cbc7d56..88ede1d53b4c 100644 --- a/arch/sparc/kernel/trampoline_64.S +++ b/arch/sparc/kernel/trampoline_64.S @@ -109,10 +109,13 @@ startup_continue: brnz,pn %g1, 1b nop - sethi %hi(p1275buf), %g2 - or %g2, %lo(p1275buf), %g2 - ldx [%g2 + 0x10], %l2 - add %l2, -(192 + 128), %sp + /* Get onto temporary stack which will be in the locked + * kernel image. + */ + sethi %hi(tramp_stack), %g1 + or %g1, %lo(tramp_stack), %g1 + add %g1, TRAMP_STACK_SIZE, %g1 + sub %g1, STACKFRAME_SZ + STACK_BIAS + 256, %sp flushw /* Setup the loop variables: @@ -394,7 +397,6 @@ after_lock_tlb: sllx %g5, THREAD_SHIFT, %g5 sub %g5, (STACKFRAME_SZ + STACK_BIAS), %g5 add %g6, %g5, %sp - mov 0, %fp rdpr %pstate, %o1 or %o1, PSTATE_IE, %o1 diff --git a/arch/sparc/prom/cif.S b/arch/sparc/prom/cif.S index 9c86b4b7d429..8050f381f518 100644 --- a/arch/sparc/prom/cif.S +++ b/arch/sparc/prom/cif.S @@ -11,11 +11,10 @@ .text .globl prom_cif_direct prom_cif_direct: + save %sp, -192, %sp sethi %hi(p1275buf), %o1 or %o1, %lo(p1275buf), %o1 - ldx [%o1 + 0x0010], %o2 ! prom_cif_stack - save %o2, -192, %sp - ldx [%i1 + 0x0008], %l2 ! prom_cif_handler + ldx [%o1 + 0x0008], %l2 ! prom_cif_handler mov %g4, %l0 mov %g5, %l1 mov %g6, %l3 diff --git a/arch/sparc/prom/init_64.c b/arch/sparc/prom/init_64.c index d95db755828f..110b0d78b864 100644 --- a/arch/sparc/prom/init_64.c +++ b/arch/sparc/prom/init_64.c @@ -26,13 +26,13 @@ phandle prom_chosen_node; * It gets passed the pointer to the PROM vector. */ -extern void prom_cif_init(void *, void *); +extern void prom_cif_init(void *); -void __init prom_init(void *cif_handler, void *cif_stack) +void __init prom_init(void *cif_handler) { phandle node; - prom_cif_init(cif_handler, cif_stack); + prom_cif_init(cif_handler); prom_chosen_node = prom_finddevice(prom_chosen_path); if (!prom_chosen_node || (s32)prom_chosen_node == -1) diff --git a/arch/sparc/prom/p1275.c b/arch/sparc/prom/p1275.c index b2340f008ae0..545d8bb79b65 100644 --- a/arch/sparc/prom/p1275.c +++ b/arch/sparc/prom/p1275.c @@ -20,7 +20,6 @@ struct { long prom_callback; /* 0x00 */ void (*prom_cif_handler)(long *); /* 0x08 */ - unsigned long prom_cif_stack; /* 0x10 */ } p1275buf; extern void prom_world(int); @@ -52,5 +51,4 @@ void p1275_cmd_direct(unsigned long *args) void prom_cif_init(void *cif_handler, void *cif_stack) { p1275buf.prom_cif_handler = (void (*)(long *))cif_handler; - p1275buf.prom_cif_stack = (unsigned long)cif_stack; } -- cgit v1.2.3 From 06090e8ed89ea2113a236befb41f71d51f100e60 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 24 Oct 2014 09:59:02 -0700 Subject: sparc64: Implement __get_user_pages_fast(). It is not sufficient to only implement get_user_pages_fast(), you must also implement the atomic version __get_user_pages_fast() otherwise you end up using the weak symbol fallback implementation which simply returns zero. This is dangerous, because it causes the futex code to loop forever if transparent hugepages are supported (see get_futex_key()). Signed-off-by: David S. Miller --- arch/sparc/mm/gup.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 1aed0432c64b..ae6ce383d4df 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -160,6 +160,36 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next, flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { -- cgit v1.2.3 From 3dec0fe48a8936528aae2fc3f904c2c9a34ba368 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Oct 2014 18:16:47 +0100 Subject: arm64: Fix memblock current_limit with 64K pages and 48-bit VA With 48-bit VA space, the 64K page configuration uses 3 levels instead of 2 and PUD_SIZE != PMD_SIZE. Since with 64K pages we only cover PMD_SIZE with the initial swapper_pg_dir populated in head.S, the memblock current_limit needs to be set accordingly in map_mem() to avoid allocating unmapped memory. The memblock current_limit is progressively increased as more blocks are mapped. Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6894ef3e6234..0bf90d26e745 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -297,11 +297,15 @@ static void __init map_mem(void) * create_mapping requires puds, pmds and ptes to be allocated from * memory addressable from the initial direct kernel mapping. * - * The initial direct kernel mapping, located at swapper_pg_dir, - * gives us PUD_SIZE memory starting from PHYS_OFFSET (which must be - * aligned to 2MB as per Documentation/arm64/booting.txt). + * The initial direct kernel mapping, located at swapper_pg_dir, gives + * us PUD_SIZE (4K pages) or PMD_SIZE (64K pages) memory starting from + * PHYS_OFFSET (which must be aligned to 2MB as per + * Documentation/arm64/booting.txt). */ - limit = PHYS_OFFSET + PUD_SIZE; + if (IS_ENABLED(CONFIG_ARM64_64K_PAGES)) + limit = PHYS_OFFSET + PMD_SIZE; + else + limit = PHYS_OFFSET + PUD_SIZE; memblock_set_current_limit(limit); /* map all the memory banks */ -- cgit v1.2.3 From 3d268c9b136f51385f9d041f3f2424501b257388 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 22:56:05 -0400 Subject: overlayfs: don't hold ->i_mutex over opening the real directory just use it to serialize the assignment Signed-off-by: Al Viro --- fs/overlayfs/readdir.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c6787f84ece9..b7d9fb098840 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -458,20 +458,27 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { struct inode *inode = file_inode(file); - mutex_lock(&inode->i_mutex); realfile = od->upperfile; if (!realfile) { struct path upperpath; ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(realfile); + mutex_lock(&inode->i_mutex); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; } - od->upperfile = realfile; + mutex_unlock(&inode->i_mutex); } - mutex_unlock(&inode->i_mutex); } return vfs_fsync_range(realfile, start, end, datasync); -- cgit v1.2.3 From 68bf8611076a8e4bee8bc8d03ff28bd1e9a9c631 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 22:58:56 -0400 Subject: overlayfs: make ovl_cache_entry->name an array instead of pointer Signed-off-by: Al Viro --- fs/overlayfs/readdir.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index b7d9fb098840..9c9872be2c72 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -18,13 +18,13 @@ #include "overlayfs.h" struct ovl_cache_entry { - const char *name; unsigned int len; unsigned int type; u64 ino; bool is_whiteout; struct list_head l_node; struct rb_node node; + char name[]; }; struct ovl_dir_cache { @@ -82,13 +82,12 @@ static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); - p = kmalloc(sizeof(*p) + len + 1, GFP_KERNEL); + p = kmalloc(size, GFP_KERNEL); if (p) { - char *name_copy = (char *) (p + 1); - memcpy(name_copy, name, len); - name_copy[len] = '\0'; - p->name = name_copy; + memcpy(p->name, name, len); + p->name[len] = '\0'; p->len = len; p->type = d_type; p->ino = ino; -- cgit v1.2.3 From 49be4fb9cc3431fc4ebc71c764db848483b2a16c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 23:00:53 -0400 Subject: overlayfs: embed root into overlay_readdir_data no sense having it a pointer - all instances have it pointing to local variable in the same stack frame Signed-off-by: Al Viro --- fs/overlayfs/readdir.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 9c9872be2c72..a9ee2c1176fe 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -36,7 +36,7 @@ struct ovl_dir_cache { struct ovl_readdir_data { struct dir_context ctx; bool is_merge; - struct rb_root *root; + struct rb_root root; struct list_head *list; struct list_head *middle; int count; @@ -101,7 +101,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { - struct rb_node **newp = &rdd->root->rb_node; + struct rb_node **newp = &rdd->root.rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; @@ -126,7 +126,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); - rb_insert_color(&p->node, rdd->root); + rb_insert_color(&p->node, &rdd->root); return 0; } @@ -137,7 +137,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(rdd->root, name, namelen); + p = ovl_cache_entry_find(&rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, rdd->middle); } else { @@ -277,12 +277,11 @@ static inline int ovl_dir_read_merged(struct path *upperpath, struct list_head *list) { int err; - struct rb_root root = RB_ROOT; struct list_head middle; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, - .root = &root, + .root = RB_ROOT, .is_merge = false, }; -- cgit v1.2.3 From db6ec212b53abc29a5bb6ac8c810010fc28d5191 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Oct 2014 23:03:03 -0400 Subject: overlayfs: embed middle into overlay_readdir_data same story... Signed-off-by: Al Viro --- fs/overlayfs/readdir.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index a9ee2c1176fe..910553f37aca 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -38,7 +38,7 @@ struct ovl_readdir_data { bool is_merge; struct rb_root root; struct list_head *list; - struct list_head *middle; + struct list_head middle; int count; int err; }; @@ -139,13 +139,13 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, p = ovl_cache_entry_find(&rdd->root, name, namelen); if (p) { - list_move_tail(&p->l_node, rdd->middle); + list_move_tail(&p->l_node, &rdd->middle); } else { p = ovl_cache_entry_new(name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else - list_add_tail(&p->l_node, rdd->middle); + list_add_tail(&p->l_node, &rdd->middle); } return rdd->err; @@ -277,7 +277,6 @@ static inline int ovl_dir_read_merged(struct path *upperpath, struct list_head *list) { int err; - struct list_head middle; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .list = list, @@ -301,11 +300,10 @@ static inline int ovl_dir_read_merged(struct path *upperpath, * Insert lowerpath entries before upperpath ones, this allows * offsets to be reasonably constant */ - list_add(&middle, rdd.list); - rdd.middle = &middle; + list_add(&rdd.middle, rdd.list); rdd.is_merge = true; err = ovl_dir_read(lowerpath, &rdd); - list_del(&middle); + list_del(&rdd.middle); } out: return err; -- cgit v1.2.3 From cac7f2429872d3733dc3f9915857b1691da2eb2f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Oct 2014 16:48:41 -0700 Subject: Linux 3.18-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 05d67af376c5..52c129725270 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 3 PATCHLEVEL = 18 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Shuffling Zombie Juror # *DOCUMENTATION* -- cgit v1.2.3 From 49dc71268bec9a6a28107ba64d834460e1513909 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 Oct 2014 09:00:21 -0700 Subject: ARM: omap2plus_defconfig: Fix bloat caused by having ipv6 built-in Commit 673ce00c5d6c (ARM: omap2plus_defconfig: Add support for distros with systemd) caused considerable bloat as noted by Paul Walmsley . Let's fix this issue by making what we can into loadable modules for the systemd options. That's only IPV6 and AUTOFS4_FS it seems, and IPv6 defaults to a loadable module. Reported-by: Paul Walmsley Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 16e719c268dd..3fbad5b95619 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -86,7 +86,6 @@ CONFIG_IP_PNP_DHCP=y CONFIG_IP_PNP_BOOTP=y CONFIG_IP_PNP_RARP=y # CONFIG_INET_LRO is not set -CONFIG_IPV6=y CONFIG_NETFILTER=y CONFIG_CAN=m CONFIG_CAN_C_CAN=m @@ -317,7 +316,7 @@ CONFIG_EXT4_FS=y CONFIG_FANOTIFY=y CONFIG_QUOTA=y CONFIG_QFMT_V2=y -CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS4_FS=m CONFIG_MSDOS_FS=y CONFIG_VFAT_FS=y CONFIG_TMPFS=y -- cgit v1.2.3 From 739fd5078493f9ea96b7562f504fcc1316d13164 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 Oct 2014 13:06:07 -0700 Subject: ARM: omap2plus_defconfig: Fix errors with NAND BCH Looks like we need to have BCH enabled to get NAND working and to avoid getting: nand: error: CONFIG_MTD_NAND_ECC_BCH not enabled Signed-off-by: Tony Lindgren --- arch/arm/configs/omap2plus_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 3fbad5b95619..b3f86670d2eb 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -111,6 +111,7 @@ CONFIG_MTD_OOPS=y CONFIG_MTD_CFI=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_NAND=y +CONFIG_MTD_NAND_ECC_BCH=y CONFIG_MTD_NAND_OMAP2=y CONFIG_MTD_ONENAND=y CONFIG_MTD_ONENAND_VERIFY_WRITE=y -- cgit v1.2.3 From 4b91f7f3c8b20e073b7bfc098625b37f99789508 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 Oct 2014 13:05:54 -0700 Subject: ARM: OMAP2+: Warn about deprecated legacy booting mode We're moving omaps to use device tree based booting and already have omap2, omap4, omap5, am335x and am437x booting in device tree only mode. Only omap3 still has legacy booting still around and we really want to make that device tree only. So let's add a warning about deprecated legacy booting so we get people to upgrade their boards to use device tree based booting and find out about any remaining issues. Note that for most boards we already have the .dts file and those can be booted with without changing the bootloader using the appended DTB mode. Acked-By: Sebastian Reichel Reviewed-by: Aaro Koskinen Reviewed-by: Javier Martinez Canillas Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/omap_device.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/arm/mach-omap2/omap_device.c b/arch/arm/mach-omap2/omap_device.c index d22c30d3ccfa..8c58b71c2727 100644 --- a/arch/arm/mach-omap2/omap_device.c +++ b/arch/arm/mach-omap2/omap_device.c @@ -917,6 +917,10 @@ static int __init omap_device_late_idle(struct device *dev, void *data) static int __init omap_device_late_init(void) { bus_for_each_dev(&platform_bus_type, NULL, NULL, omap_device_late_idle); + + WARN(!of_have_populated_dt(), + "legacy booting deprecated, please update to boot with .dts\n"); + return 0; } omap_late_initcall_sync(omap_device_late_init); -- cgit v1.2.3