summaryrefslogtreecommitdiff
path: root/lib/uwildmat
diff options
context:
space:
mode:
authorDerek Morton <derek.j.morton@intel.com>2016-02-04 12:06:57 +0000
committerDaniel Vetter <daniel.vetter@ffwll.ch>2016-02-17 18:21:12 +0100
commitef724ab2bba7a51113542c0bc9e77d34e8935c60 (patch)
tree5ce7c5095e9a4679e0966f2c71beef18e03efc9c /lib/uwildmat
parent0a45d6ba1371bb879f55d9f75d893552c2f71c97 (diff)
lib/igt_core.c: Expand --run-subtest functionality.
Added extended wildcard support when specifying --run-subtest. Wildcard format is as specified in rfc3977 and the uwildmat() implementation is taken from libinn. See https://tools.ietf.org/html/rfc3977#section-4 for a description of allowed wildcard expressions. v2: Use comma as list separator (Ville Syrjala) support both ^ and ! as not operators (Dave Gordon) v3: Updated to use uwildmat() (Dave Gordon) Signed-off-by: Derek Morton <derek.j.morton@intel.com> [danvet: Fixup whitespace. Add #include <stdint.h>. Run lint.] Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Diffstat (limited to 'lib/uwildmat')
-rw-r--r--lib/uwildmat/uwildmat.c475
-rw-r--r--lib/uwildmat/uwildmat.h24
2 files changed, 499 insertions, 0 deletions
diff --git a/lib/uwildmat/uwildmat.c b/lib/uwildmat/uwildmat.c
new file mode 100644
index 00000000..fac2319e
--- /dev/null
+++ b/lib/uwildmat/uwildmat.c
@@ -0,0 +1,475 @@
+/* uwildmat.c is reused from libinn - https://launchpad.net/ubuntu/+source/inn2/2.5.4-1
+
+ This provides wild card matching originally used in InterNetNews and is
+ described in https://tools.ietf.org/html/rfc3977#section-4
+
+ INN licence:
+ INN as a whole and all code contained in it not otherwise marked with
+ different licenses and/or copyrights is covered by the following copyright
+ and license:
+
+ Copyright (c) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012,
+ 2013, 2014 by Internet Systems Consortium, Inc. ("ISC")
+ Copyright (c) 1991, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
+ 2002, 2003 by The Internet Software Consortium and Rich Salz
+
+ This code is derived from software contributed to the Internet Software
+ Consortium by Rich Salz.
+
+ Permission to use, copy, modify, and distribute this software for any
+ purpose with or without fee is hereby granted, provided that the above
+ copyright notice and this permission notice appear in all copies.
+
+ THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
+ REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY
+ SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+*/
+
+/* $Id: uwildmat.c 8918 2010-01-22 23:28:28Z iulius $
+ **
+ ** wildmat pattern matching with Unicode UTF-8 extensions.
+ **
+ ** Do shell-style pattern matching for ?, \, [], and * characters. Might not
+ ** be robust in face of malformed patterns; e.g., "foo[a-" could cause a
+ ** segmentation violation. It is 8-bit clean. (Robustness hopefully fixed
+ ** July 2000; all malformed patterns should now just fail to match anything.)
+ **
+ ** Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
+ ** Rich $alz is now <rsalz@osf.org>.
+ **
+ ** April, 1991: Replaced mutually-recursive calls with in-line code for the
+ ** star character.
+ **
+ ** Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
+ ** This can greatly speed up failing wildcard patterns. For example:
+ **
+ ** pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
+ ** text 1: -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
+ ** text 2: -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
+ **
+ ** Text 1 matches with 51 calls, while text 2 fails with 54 calls. Without
+ ** the ABORT code, it takes 22310 calls to fail. Ugh. The following
+ ** explanation is from Lars:
+ **
+ ** The precondition that must be fulfilled is that DoMatch will consume at
+ ** least one character in text. This is true if *p is neither '*' nor '\0'.)
+ ** The last return has ABORT instead of false to avoid quadratic behaviour in
+ ** cases like pattern "*a*b*c*d" with text "abcxxxxx". With false, each
+ ** star-loop has to run to the end of the text; with ABORT only the last one
+ ** does.
+ **
+ ** Once the control of one instance of DoMatch enters the star-loop, that
+ ** instance will return either true or ABORT, and any calling instance will
+ ** therefore return immediately after (without calling recursively again).
+ ** In effect, only one star-loop is ever active. It would be possible to
+ ** modify the code to maintain this context explicitly, eliminating all
+ ** recursive calls at the cost of some complication and loss of clarity (and
+ ** the ABORT stuff seems to be unclear enough by itself). I think it would
+ ** be unwise to try to get this into a released version unless you have a
+ ** good test data base to try it out on.
+ **
+ ** June, 1991: Robert Elz <kre@munnari.oz.au> added minus and close bracket
+ ** handling for character sets.
+ **
+ ** July, 2000: Largely rewritten by Russ Allbery <rra@stanford.edu> to add
+ ** support for ',', '!', and optionally '@' to the core wildmat routine.
+ ** Broke the character class matching into a separate function for clarity
+ ** since it's infrequently used in practice, and added some simple lookahead
+ ** to significantly decrease the recursive calls in the '*' matching code.
+ ** Added support for UTF-8 as the default character set for any high-bit
+ ** characters.
+ **
+ ** For more information on UTF-8, see RFC 3629.
+ **
+ ** Please note that this file is intentionally written so that conditionally
+ ** executed expressions are on separate lines from the condition to
+ ** facilitate analysis of the coverage of the test suite using purecov.
+ ** Please preserve this. As of March 11, 2001, purecov reports that the
+ ** accompanying test suite achieves 100% coverage of this file.
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include "uwildmat.h"
+
+#define ABORT -1
+
+/* Whether or not an octet looks like the start of a UTF-8 character. */
+#define ISUTF8(c) (((c) & 0xc0) == 0xc0)
+
+
+/*
+ ** Determine the length of a non-ASCII character in octets (for advancing
+ ** pointers when skipping over characters). Takes a pointer to the start of
+ ** the character and to the last octet of the string. If end is NULL, expect
+ ** the string pointed to by start to be nul-terminated. If the character is
+ ** malformed UTF-8, return 1 to treat it like an eight-bit local character.
+ */
+static int
+utf8_length(const unsigned char *start, const unsigned char *end)
+{
+ unsigned char mask = 0x80;
+ const unsigned char *p;
+ int length = 0;
+ int left;
+
+ for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
+ length++;
+ if (length < 2 || length > 6)
+ return 1;
+ if (end != NULL && (end - start + 1) < length)
+ return 1;
+ left = length - 1;
+ for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
+ left--;
+ return (left == 0) ? length : 1;
+}
+
+
+/*
+ ** Check whether a string contains only valid UTF-8 characters.
+ */
+bool
+is_valid_utf8(const char *text)
+{
+ unsigned char mask;
+ const unsigned char *p;
+ int length;
+ int left;
+
+ for (p = (const unsigned char *)text; *p != '\0';) {
+ mask = 0x80;
+ length = 0;
+
+ /* Find out the expected length of the character. */
+ for (; mask > 0 && (*p & mask) == mask; mask >>= 1)
+ length++;
+
+ p++;
+
+ /* Valid ASCII. */
+ if (length == 0)
+ continue;
+
+ /* Invalid length. */
+ if (length < 2 || length > 6)
+ return false;
+
+ /* Check that each byte looks like 10xxxxxx, except for the first. */
+ left = length - 1;
+ for (; left > 0 && (*p & 0xc0) == 0x80; p++)
+ left--;
+
+ if (left > 0)
+ return false;
+ }
+
+ return true;
+}
+
+
+/*
+ ** Convert a UTF-8 character to UCS-4. Takes a pointer to the start of the
+ ** character and to the last octet of the string, and to a uint32_t into
+ ** which to put the decoded UCS-4 value. If end is NULL, expect the string
+ ** pointed to by start to be nul-terminated. Returns the number of octets in
+ ** the UTF-8 encoding. If the UTF-8 character is malformed, set result to
+ ** the decimal value of the first octet; this is wrong, but it will generally
+ ** cause the rest of the wildmat matching to do the right thing for non-UTF-8
+ ** input.
+ */
+static int
+utf8_decode(const unsigned char *start, const unsigned char *end,
+ uint32_t *result)
+{
+ uint32_t value = 0;
+ int length, i;
+ const unsigned char *p = start;
+ unsigned char mask;
+
+ length = utf8_length(start, end);
+ if (length < 2) {
+ *result = *start;
+ return 1;
+ }
+ mask = (1 << (7 - length)) - 1;
+ value = *p & mask;
+ p++;
+ for (i = length - 1; i > 0; i--) {
+ value = (value << 6) | (*p & 0x3f);
+ p++;
+ }
+ *result = value;
+ return length;
+}
+
+
+/*
+ ** Match a character class against text, a UCS-4 character. start is a
+ ** pointer to the first character of the character class, end a pointer to
+ ** the last. Returns whether the class matches that character.
+ */
+static bool
+match_class(uint32_t text, const unsigned char *start,
+ const unsigned char *end)
+{
+ bool reversed, allowrange;
+ const unsigned char *p = start;
+ uint32_t first = 0;
+ uint32_t last;
+
+ /* Check for an inverted character class (starting with ^). If the
+ character matches the character class, we return !reversed; that way,
+ we return true if it's a regular character class and false if it's a
+ reversed one. If the character doesn't match, we return reversed. */
+ reversed = (*p == '^');
+ if (reversed)
+ p++;
+
+ /* Walk through the character class until we reach the end or find a
+ match, handling character ranges as we go. Only permit a range to
+ start when allowrange is true; this allows - to be treated like a
+ normal character as the first character of the class and catches
+ malformed ranges like a-e-n. We treat the character at the beginning
+ of a range as both a regular member of the class and the beginning of
+ the range; this is harmless (although it means that malformed ranges
+ like m-a will match m and nothing else). */
+ allowrange = false;
+ while (p <= end) {
+ if (allowrange && *p == '-' && p < end) {
+ p++;
+ p += utf8_decode(p, end, &last);
+ if (text >= first && text <= last)
+ return !reversed;
+ allowrange = false;
+ } else {
+ p += utf8_decode(p, end, &first);
+ if (text == first)
+ return !reversed;
+ allowrange = true;
+ }
+ }
+ return reversed;
+}
+
+
+/*
+ ** Match the text against the pattern between start and end. This is a
+ ** single pattern; a leading ! or @ must already be taken care of, and
+ ** commas must be dealt with outside of this routine.
+ */
+static int
+match_pattern(const unsigned char *text, const unsigned char *start,
+ const unsigned char *end)
+{
+ const unsigned char *q, *endclass;
+ const unsigned char *p = start;
+ bool ismeta;
+ int matched, width;
+ uint32_t c;
+
+ for (; p <= end; p++) {
+ if (!*text && *p != '*')
+ return ABORT;
+
+ switch (*p) {
+ case '\\':
+ if (!*++p)
+ return ABORT;
+ /* Fall through. */
+
+ default:
+ if (*text++ != *p)
+ return false;
+ break;
+
+ case '?':
+ text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
+ break;
+
+ case '*':
+ /* Consecutive stars are equivalent to one. Advance pattern to
+ the character after the star. */
+ for (++p; *p == '*'; p++)
+ ;
+
+ /* A trailing star will match anything. */
+ if (p > end)
+ return true;
+
+ /* Basic algorithm: Recurse at each point where the * could
+ possibly match. If the match succeeds or aborts, return
+ immediately; otherwise, try the next position.
+
+Optimization: If the character after the * in the pattern
+isn't a metacharacter (the common case), then the * has to
+consume characters at least up to the next occurrence of that
+character in the text. Scan forward for those points rather
+than recursing at every possible point to save the extra
+function call overhead. */
+ ismeta = (*p == '[' || *p == '?' || *p == '\\');
+ while (*text) {
+ width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
+ if (ismeta) {
+ matched = match_pattern(text, p, end);
+ text += width;
+ } else {
+ while (*text && *text != *p) {
+ text += width;
+ width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
+ }
+ if (!*text)
+ return ABORT;
+ matched = match_pattern(++text, p + 1, end);
+ }
+ if (matched != false)
+ return matched;
+ }
+ return ABORT;
+
+ case '[':
+ /* Find the end of the character class, making sure not to pick
+ up a close bracket at the beginning of the class. */
+ p++;
+ q = p + (*p == '^') + 1;
+ if (q > end)
+ return ABORT;
+ endclass = memchr(q, ']', (size_t) (end - q + 1));
+ if (!endclass)
+ return ABORT;
+
+ /* Do the heavy lifting in another function for clarity, since
+ character classes are an uncommon case. */
+ text += utf8_decode(text, NULL, &c);
+ if (!match_class(c, p, endclass - 1))
+ return false;
+ p = endclass;
+ break;
+ }
+ }
+
+ return (*text == '\0');
+}
+
+
+/*
+ ** Takes text and a wildmat expression; a wildmat expression is a
+ ** comma-separated list of wildmat patterns, optionally preceded by ! to
+ ** invert the sense of the expression. Returns UWILDMAT_MATCH if that
+ ** expression matches the text, UWILDMAT_FAIL otherwise. If allowpoison is
+ ** set, allow @ to introduce a poison expression (the same as !, but if it
+ ** triggers the failed match the routine returns UWILDMAT_POISON instead).
+ */
+static enum uwildmat
+match_expression(const unsigned char *text, const unsigned char *start,
+ bool allowpoison)
+{
+ const unsigned char *end, *split;
+ const unsigned char *p = start;
+ bool reverse, escaped;
+ bool match = false;
+ bool poison = false;
+ bool poisoned = false;
+
+ /* Handle the empty expression separately, since otherwise end will be
+ set to an invalid pointer. */
+ if (!*p)
+ return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
+ end = start + strlen((const char *) start) - 1;
+
+ /* Main match loop. Find each comma that separates patterns, and attempt
+ to match the text with each pattern in order. The last matching
+ pattern determines whether the whole expression matches. */
+ for (; p <= end + 1; p = split + 1) {
+ if (allowpoison)
+ poison = (*p == '@');
+ reverse = (*p == '!') || poison;
+ if (reverse)
+ p++;
+
+ /* Find the first unescaped comma, if any. If there is none, split
+ will be one greater than end and point at the nul at the end of
+ the string. */
+ for (escaped = false, split = p; split <= end; split++) {
+ if (*split == '[') {
+ split++;
+ if (*split == ']')
+ split++;
+ while (split <= end && *split != ']')
+ split++;
+ }
+ if (*split == ',' && !escaped)
+ break;
+ escaped = (*split == '\\') ? !escaped : false;
+ }
+
+ /* Optimization: If match == !reverse and poison == poisoned, this
+ pattern can't change the result, so don't do any work. */
+ if (match == !reverse && poison == poisoned)
+ continue;
+ if (match_pattern(text, p, split - 1) == true) {
+ poisoned = poison;
+ match = !reverse;
+ }
+ }
+ if (poisoned)
+ return UWILDMAT_POISON;
+ return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
+}
+
+
+/*
+ ** User-level routine used for wildmats where @ should be treated as a
+ ** regular character.
+ */
+bool
+uwildmat(const char *text, const char *pat)
+{
+ const unsigned char *utext = (const unsigned char *) text;
+ const unsigned char *upat = (const unsigned char *) pat;
+
+ if (upat[0] == '*' && upat[1] == '\0')
+ return true;
+ else
+ return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
+}
+
+
+/*
+ ** User-level routine used for wildmats that support poison matches.
+ */
+enum uwildmat
+uwildmat_poison(const char *text, const char *pat)
+{
+ const unsigned char *utext = (const unsigned char *) text;
+ const unsigned char *upat = (const unsigned char *) pat;
+
+ if (upat[0] == '*' && upat[1] == '\0')
+ return UWILDMAT_MATCH;
+ else
+ return match_expression(utext, upat, true);
+}
+
+
+/*
+ ** User-level routine for simple expressions (neither , nor ! are special).
+ */
+bool
+uwildmat_simple(const char *text, const char *pat)
+{
+ const unsigned char *utext = (const unsigned char *) text;
+ const unsigned char *upat = (const unsigned char *) pat;
+ size_t length;
+
+ if (upat[0] == '*' && upat[1] == '\0')
+ return true;
+ else {
+ length = strlen(pat);
+ return (match_pattern(utext, upat, upat + length - 1) == true);
+ }
+}
diff --git a/lib/uwildmat/uwildmat.h b/lib/uwildmat/uwildmat.h
new file mode 100644
index 00000000..2e471891
--- /dev/null
+++ b/lib/uwildmat/uwildmat.h
@@ -0,0 +1,24 @@
+
+#ifndef UWILDMAT_H
+#define UWILDMAT_H 1
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <stdbool.h>
+
+/*
+** WILDMAT MATCHING
+*/
+enum uwildmat {
+ UWILDMAT_FAIL = 0,
+ UWILDMAT_MATCH = 1,
+ UWILDMAT_POISON
+};
+
+extern bool is_valid_utf8(const char *start);
+extern bool uwildmat(const char *text, const char *pat);
+extern bool uwildmat_simple(const char *text, const char *pat);
+extern enum uwildmat uwildmat_poison(const char *text, const char *pat);
+
+
+#endif /* UWILDMAT_H */