From ec7cf149e9063772eb884c3a7d44b65291bee860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kurucz=20Gy=C3=B6rgy?= <kuruczgyurci@hotmail.com> Date: Thu, 4 Oct 2018 00:04:53 +0200 Subject: [PATCH] Modularized the codebase further. Added a specification file. --- .gitignore | 2 + Makefile | 2 +- class.c | 36 ++++++++-- class.h | 8 ++- dfa.c | 197 ++++++++++++++++------------------------------------- main.c | 35 ++-------- nfa.c | 101 +++++++++++++++++++++++++++ nfa.h | 11 +++ parse.c | 8 ++- spec.tex | 102 +++++++++++++++++++++++++++ 10 files changed, 320 insertions(+), 182 deletions(-) create mode 100644 nfa.c create mode 100644 nfa.h create mode 100644 spec.tex diff --git a/.gitignore b/.gitignore index ba2906d..5cf1daf 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ main +spec.* +!spec.tex diff --git a/Makefile b/Makefile index 2c2e527..7bcedbf 100644 --- a/Makefile +++ b/Makefile @@ -1,2 +1,2 @@ all: - gcc -Wall -pedantic -g main.c class.c dfa.c parse.c -o main \ No newline at end of file + gcc -Wall -pedantic -g main.c class.c nfa.c dfa.c parse.c -o main \ No newline at end of file diff --git a/class.c b/class.c index 9f837a1..253409f 100644 --- a/class.c +++ b/class.c @@ -1,16 +1,22 @@ #include "class.h" #include <stdlib.h> +#include <stdio.h> +#include <ctype.h> void class_union(class *a, class b) { - for (int i = 0; i < 4; i++) (a->d)[i] |= (b.d)[i]; + for (int i = 0; i < CLASS_LEN; i++) (a->d)[i] |= (b.d)[i]; +} + +void class_neg(class *a) { + for (int i = 0; i < CLASS_LEN; i++) a->d[i] = ~(a->d[i]); } bool class_any(class c) { - return c.d[0] | c.d[1] | c.d[2] | c.d[3]; + return c.d[0] | c.d[1]; } bool class_eq(class a, class b) { - for (int i = 0; i < 4; i++) if (a.d[i] != b.d[i]) return false; + for (int i = 0; i < CLASS_LEN; i++) if (a.d[i] != b.d[i]) return false; return true; } @@ -18,15 +24,31 @@ bool class_has(class a, char c) { return a.d[c/64] & (1ULL << c%64); } -char* class_str(class c) { - char *res = (char*)malloc(0x101); +char* 
class_str(class c, int *out_len) { + int bits = 0; + for (int i = 0; i < CLASS_BITS; i++) { + if (c.d[i/64] & (1ULL << i%64)) bits++; + } + + char *res = (char*)malloc(CLASS_BITS*3); int idx = 0; - for (int i = 0; i <= 0xFF; i++) { + if (bits > CLASS_BITS/2) { + class_neg(&c); + res[idx++] = '^'; + } + + for (int i = 0; i < CLASS_BITS; i++) { if (c.d[i/64] & (1ULL << i%64)) { - res[idx++] = (char)i; + char ch = (char)i; + if (iscntrl(ch) || ch == '\\' || ch == '\"') { + idx += sprintf(res+idx, "\\%02X", ch); + } else { + res[idx++] = ch; + } } } res[idx] = '\0'; + *out_len = idx; return res; } diff --git a/class.h b/class.h index 58dddc5..5e8608b 100644 --- a/class.h +++ b/class.h @@ -1,19 +1,21 @@ - #ifndef _CLASS_H_ #define _CLASS_H_ #include <stdbool.h> +#define CLASS_BITS (1 << 7) +#define CLASS_LEN CLASS_BITS/64 typedef struct { - unsigned long long d[4]; + unsigned long long d[CLASS_LEN]; } class; static const class CLASS_NONE = { {0} }; void class_union(class *a, class b); +void class_neg(class *a); bool class_any(class c); bool class_eq(class a, class b); bool class_has(class a, char c); -char* class_str(class c); +char* class_str(class c, int *out_len); class class_char(char c); #endif \ No newline at end of file diff --git a/dfa.c b/dfa.c index 8f873c6..bfe8ace 100644 --- a/dfa.c +++ b/dfa.c @@ -1,4 +1,5 @@ #include "dfa.h" +#include "nfa.h" #include <stdlib.h> #include <string.h> @@ -16,118 +17,15 @@ typedef struct { bool end; } dfa_temp_node; -typedef struct { - class class; - int a, b; -} nfa_node; - -void nfa_class(nfa_node *nfa, int idx, class class) { - nfa[idx] = (nfa_node){ class, idx+1, -1 }; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -void nfa_union(nfa_node *nfa, int idx, int a, int b) { - nfa[idx] = (nfa_node){ CLASS_NONE, a, b }; - nfa[a+1].a = idx+1; - nfa[b+1].a = idx+1; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -void nfa_concat(nfa_node *nfa, int idx, int a, int b) { - nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; - 
nfa[a+1].a = b; - nfa[b+1].a = idx+1; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -void nfa_star(nfa_node *nfa, int idx, int a) { - nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; - nfa[a+1].a = idx+1; - nfa[a+1].b = a; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -void nfa_plus(nfa_node *nfa, int idx, int a) { - nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; - nfa[a+1].a = idx+1; - nfa[a+1].b = a; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -void nfa_questionmark(nfa_node *nfa, int idx, int a) { - nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; - nfa[a+1].a = idx+1; - nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; -} - -int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx) { - ast_node current = ast[ast_idx]; - if (current.op == '*') { - int a = ast_to_nfa(nfa, ast, idx, current.a); - int r = *idx; - nfa_star(nfa, r, a); - *idx += 2; - return r; - } else if (current.op == '|') { - int a = ast_to_nfa(nfa, ast, idx, current.a); - int b = ast_to_nfa(nfa, ast, idx, current.b); - int r = *idx; - nfa_union(nfa, r, a, b); - *idx += 2; - return r; - } else if (current.op == '&') { - int a = ast_to_nfa(nfa, ast, idx, current.a); - int b = ast_to_nfa(nfa, ast, idx, current.b); - int r = *idx; - nfa_concat(nfa, r, a, b); - *idx += 2; - return r; - } else if (current.op == '+') { - int a = ast_to_nfa(nfa, ast, idx, current.a); - int r = *idx; - nfa_plus(nfa, r, a); - *idx += 2; - return r; - } else if (current.op == '?') { - int a = ast_to_nfa(nfa, ast, idx, current.a); - int r = *idx; - nfa_questionmark(nfa, r, a); - *idx += 2; - return r; - } else { - int r = *idx; - nfa_class(nfa, r, current.class); - *idx += 2; - return r; - } -} - -void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set) { - int p = 0; - stack[p++] = idx; - while (p > 0) { - idx = stack[--p]; - nfa_node n = nfa[idx]; - if (vis[idx]) continue; - vis[idx] = 1; - if (!class_any(n.class)) { - /* both epsilon transitions */ - if (n.a != -1) stack[p++] = n.a; 
- if (n.b != -1) stack[p++] = n.b; - } else { - /* n.a 'n.class' transition */ - class_union(&set[n.a], n.class); - } - } -} - dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { int nfa_begin = nfa_count - 2; int nfa_end = nfa_count - 1; - int *stack =(int*)malloc(sizeof(int)*nfa_count); + //variables used in the nfa_set function + int *stack = (int*)malloc(sizeof(int)*nfa_count); int *vis = (int*)malloc(sizeof(int)*nfa_count); class *class_set = (class*)malloc(sizeof(class)*nfa_count); + dfa_temp_node *dfa_nodes = (dfa_temp_node*)malloc(sizeof(dfa_temp_node)*nfa_count); int dfa_count = 0; int *dfa_stack = (int*)malloc(sizeof(int)*nfa_count); @@ -146,17 +44,16 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { dfa_count++; } while (p > 0) { - int dfa_idx = dfa_stack[--p]; - dfa_temp_node *current = &dfa_nodes[dfa_idx]; - printf("processing dfa[%d]; set={", dfa_idx); - for (int i = 0; i < current->n_set; i++) { - printf("%d,", current->set[i]); - } + int current_idx = dfa_stack[--p]; + dfa_temp_node *current = &dfa_nodes[current_idx]; + printf("processing dfa[%d]; set={", current_idx); + for (int i = 0; i < current->n_set; i++) printf("%d,", current->set[i]); printf("}\n"); - int **reachable_by_symbol = (int**)malloc(sizeof(int*)*0xFF); - int *symbol_lengths = (int*)malloc(sizeof(int)*0xFF); - char *symbols = (char*)malloc(sizeof(char)*0xFF); - int symbol_count = 0; + + int **edges = (int**)malloc(sizeof(int*)*0xFF); + int *edge_set_lengths = (int*)malloc(sizeof(int)*0xFF); + char *edge_symbols = (char*)malloc(sizeof(char)*0xFF); + int edge_count = 0; for (int i = 0; i < current->n_set; i++) { //reset vectors for (int k = 0; k < nfa_count; k++) { @@ -169,45 +66,44 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { current->end = current->end || vis[nfa_end]; //calculate all possible symbols class all_classes = CLASS_NONE; - for (int f = 0; f < nfa_count; f++) - class_union(&all_classes, class_set[f]); + for (int f = 0; f < nfa_count; f++) class_union(&all_classes, 
class_set[f]); //iterate possible symbols - char *all_classes_str = class_str(all_classes); - printf("all possible reachable classes from nfa[%d]: %s\n", nfa_node_idx, all_classes_str); - for (char* ch = all_classes_str; *ch; ch++) { - int symbol = *ch; + for (int ch = 0; ch < CLASS_BITS; ch++) { + int symbol = (char)ch; + if (!class_has(all_classes, symbol)) continue; int symbol_idx = -1; - for (int f = 0; f < symbol_count; f++) { - if (symbols[f] == symbol) { + for (int f = 0; f < edge_count; f++) { + if (edge_symbols[f] == symbol) { symbol_idx = f; break; } } if (symbol_idx == -1) { - reachable_by_symbol[symbol_count] = (int*)malloc(sizeof(int)*nfa_count); - symbol_lengths[symbol_count] = 0; - symbols[symbol_count] = symbol; - symbol_idx = symbol_count++; + edges[edge_count] = (int*)malloc(sizeof(int)*nfa_count); + edge_set_lengths[edge_count] = 0; + edge_symbols[edge_count] = symbol; + symbol_idx = edge_count++; } //calculate reachable nfa node set for this symbol printf("\ttransition set from nfa[%d] for symbol '%c': {", nfa_node_idx, symbol); for (int f = 0; f < nfa_count; f++) { if (class_has(class_set[f], symbol)) { - reachable_by_symbol[symbol_idx][symbol_lengths[symbol_idx]++] = f; + edges[symbol_idx][edge_set_lengths[symbol_idx]++] = f; printf("%d,", f); } } printf("}\n"); } } - int *transitions = (int*)malloc(sizeof(int)*symbol_count); - class *transition_classes = (class*)malloc(sizeof(class)*symbol_count); - for (int i = 0; i < symbol_count; i++) { - qsort(reachable_by_symbol[i], symbol_lengths[i], sizeof(int), int_cmp); + int *transitions = (int*)malloc(sizeof(int)*edge_count); + class *transition_classes = (class*)malloc(sizeof(class)*edge_count); + int reduced_edge_count = 0; + for (int i = 0; i < edge_count; i++) { + qsort(edges[i], edge_set_lengths[i], sizeof(int), int_cmp); int target = -1; for (int k = 0; k < dfa_count; k++) { - if (dfa_nodes[k].n_set != symbol_lengths[i]) continue; - if (memcmp(dfa_nodes[k].set, reachable_by_symbol[i], 
symbol_lengths[i]) == 0) { //match + if (dfa_nodes[k].n_set != edge_set_lengths[i]) continue; + if (memcmp(dfa_nodes[k].set, edges[i], edge_set_lengths[i]) == 0) { //match target = k; break; } @@ -215,22 +111,40 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { if (target == -1) { printf("\t\tpushing new dfa_node dfa[%d]\n", dfa_count); dfa_nodes[dfa_count] = (dfa_temp_node){ - symbol_lengths[i], -1, - reachable_by_symbol[i], + edge_set_lengths[i], -1, + edges[i], NULL, NULL, false }; dfa_stack[p++] = dfa_count; target = dfa_count; dfa_count++; } - transitions[i] = target; - transition_classes[i] = class_char(symbols[i]); + int reusable_edge = -1; + for (int k = 0; k < reduced_edge_count; k++) { + if (transitions[k] == target) { + reusable_edge = k; + break; + } + } + class c = class_char(edge_symbols[i]); + if (reusable_edge != -1) { + class_union(&transition_classes[reusable_edge], c); + } else { + transitions[reduced_edge_count] = target; + transition_classes[reduced_edge_count] = c; + reduced_edge_count++; + } } - current->n_transitions = symbol_count; + current->n_transitions = reduced_edge_count; current->transitions = transitions; current->transition_classes = transition_classes; } + free(vis); + free(stack); + free(class_set); + free(dfa_stack); + dfa dfa_obj = { (dfa_node*)malloc(sizeof(dfa_node) * dfa_count), dfa_count, @@ -250,6 +164,9 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) { } dfa_obj.nodes[i] = d; } + + free(dfa_nodes); + return dfa_obj; } diff --git a/main.c b/main.c index 1da149c..0eaf1e6 100644 --- a/main.c +++ b/main.c @@ -5,33 +5,6 @@ #include <stdlib.h> int main(int argc, char **argv) { - // ast_node ast[] = { - // { '*', CLASS_NONE, 1, -1 }, - // { '|', CLASS_NONE, 2, 3 }, - // { '\0', class_char('a'), -1, -1 }, - // { '\0', class_char('b'), -1, -1 } - // }; - // int ast_count = 4; - - // ast_node ast[] = { - // /* 0*/{ '*', CLASS_NONE, 1, -1 }, - // /* 1*/{ '|', CLASS_NONE, 2, 7 }, - // /* 2*/{ '|', CLASS_NONE, 3, 4 }, - // /* 3*/{ 
'0', class_char('a'), -1, -1 }, - // /* 4*/{ '&', CLASS_NONE, 5, 6 }, - // /* 5*/{ '0', class_char('a'), -1, -1 }, - // /* 6*/{ '0', class_char('b'), -1, -1 }, - // /* 7*/{ '&', CLASS_NONE, 8, 9 }, - // /* 8*/{ '0', class_char('a'), -1, -1 }, - // /* 9*/{ '&', CLASS_NONE, 10, 11 }, - // /*10*/{ '0', class_char('b'), -1, -1 }, - // /*11*/{ '&', CLASS_NONE, 12, 13 }, - // /*12*/{ '0', class_char('c'), -1, -1 }, - // /*13*/{ '*', CLASS_NONE, 14, -1 }, - // /*14*/{ '0', class_char('x'), -1, -1 }, - // }; - //int ast_count = sizeof(ast) / sizeof(ast_node); - if (argc < 2) { printf("No regex argument found!\n"); return 1; @@ -43,7 +16,8 @@ int main(int argc, char **argv) { printf("Parsed regex '%s' [root=%d]:\n", r, ast_root); for (int i = 0; i < ast_count; i++) { ast_node a = parsed_ast[i]; - char* str = class_str(a.class); + int out_len; + char* str = class_str(a.class, &out_len); printf("\tast[%d] = {%c, %s, %d, %d}\n", i, a.op, str, a.a, a.b); free(str); } @@ -56,8 +30,9 @@ int main(int argc, char **argv) { dfa_node node = d.nodes[i]; printf("dfa%c%c[%d] transitions: { ", (i == d.begin ? '>' : ' '), (node.end ? '!' 
: ' '), i); for (int k = 0; k < node.edges; k++) { - char* str = class_str(node.what[k]); - printf("%s:%d, ", str, node.to[k]); + int out_len; + char* str = class_str(node.what[k], &out_len); + printf("\"%s\":%d, ", str, node.to[k]); free(str); } printf("}\n"); diff --git a/nfa.c b/nfa.c new file mode 100644 index 0000000..74875e8 --- /dev/null +++ b/nfa.c @@ -0,0 +1,101 @@ +#include "nfa.h" + +void nfa_class(nfa_node *nfa, int idx, class class) { + nfa[idx] = (nfa_node){ class, idx+1, -1 }; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +void nfa_union(nfa_node *nfa, int idx, int a, int b) { + nfa[idx] = (nfa_node){ CLASS_NONE, a, b }; + nfa[a+1].a = idx+1; + nfa[b+1].a = idx+1; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +void nfa_concat(nfa_node *nfa, int idx, int a, int b) { + nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; + nfa[a+1].a = b; + nfa[b+1].a = idx+1; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +void nfa_star(nfa_node *nfa, int idx, int a) { + nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; + nfa[a+1].a = idx+1; + nfa[a+1].b = a; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +void nfa_plus(nfa_node *nfa, int idx, int a) { + nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; + nfa[a+1].a = idx+1; + nfa[a+1].b = a; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +void nfa_questionmark(nfa_node *nfa, int idx, int a) { + nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; + nfa[a+1].a = idx+1; + nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; +} + +int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx) { + ast_node current = ast[ast_idx]; + if (current.op == '*') { + int a = ast_to_nfa(nfa, ast, idx, current.a); + int r = *idx; + nfa_star(nfa, r, a); + *idx += 2; + return r; + } else if (current.op == '|') { + int a = ast_to_nfa(nfa, ast, idx, current.a); + int b = ast_to_nfa(nfa, ast, idx, current.b); + int r = *idx; + nfa_union(nfa, r, a, b); + *idx += 2; + return r; + } else if (current.op == '&') { + int a = 
ast_to_nfa(nfa, ast, idx, current.a); + int b = ast_to_nfa(nfa, ast, idx, current.b); + int r = *idx; + nfa_concat(nfa, r, a, b); + *idx += 2; + return r; + } else if (current.op == '+') { + int a = ast_to_nfa(nfa, ast, idx, current.a); + int r = *idx; + nfa_plus(nfa, r, a); + *idx += 2; + return r; + } else if (current.op == '?') { + int a = ast_to_nfa(nfa, ast, idx, current.a); + int r = *idx; + nfa_questionmark(nfa, r, a); + *idx += 2; + return r; + } else { + int r = *idx; + nfa_class(nfa, r, current.class); + *idx += 2; + return r; + } +} + +void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set) { + int p = 0; + stack[p++] = idx; + while (p > 0) { + idx = stack[--p]; + nfa_node n = nfa[idx]; + if (vis[idx]) continue; + vis[idx] = 1; + if (!class_any(n.class)) { + /* both epsilon transitions */ + if (n.a != -1) stack[p++] = n.a; + if (n.b != -1) stack[p++] = n.b; + } else { + /* n.a 'n.class' transition */ + class_union(&set[n.a], n.class); + } + } +} \ No newline at end of file diff --git a/nfa.h b/nfa.h new file mode 100644 index 0000000..33b1f83 --- /dev/null +++ b/nfa.h @@ -0,0 +1,11 @@ + +#include "class.h" +#include "parse.h" + +typedef struct { + class class; + int a, b; +} nfa_node; + +int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx); +void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set); \ No newline at end of file diff --git a/parse.c b/parse.c index d713cc4..3a85eb3 100644 --- a/parse.c +++ b/parse.c @@ -19,11 +19,17 @@ int parse_regex(parser_state *s); class parse_class(parser_state *s) { class c = CLASS_NONE; - char t; + bool neg = false; + char t = s->tokens[s->idx]; + if (t == '^') { + neg = true; + s->idx++; + } while (t = s->tokens[s->idx], t != ']') { s->idx++; class_union(&c, class_char(t)); } + if (neg) class_neg(&c); return c; } diff --git a/spec.tex b/spec.tex new file mode 100644 index 0000000..1f0090e --- /dev/null +++ b/spec.tex @@ -0,0 +1,102 @@ + +\documentclass{article} 
+\usepackage{amsfonts} + +\title{Reguláris kifejezések} +\author{Kurucz György} + +\begin{document} +\maketitle + +\begin{enumerate} +\item Karakterkészlet \par +A program az ASCII karakterkészlet karaktereit képes feldolgozni. +Egy helyes bemeneti fájl minden $c_i$ bájtjára igaz, +hogy $c_i \leq \mathtt{0x7F}$. + +\item Reguláris kifejezés karakterkészlete \par +A program által feldolgozott reguláris kifejezés karakterkészlete a bemeneti +karakterkészlet egy részhalmaza, azaz ugyanúgy ASCII kódolású. A reguláris kifejezés minden $r_i$ +karakterére igaz, hogy $\mathtt{0x20} \leq r_i \leq \mathtt{0x7E}$. Ez alapján +az alábbi karakterek szerepelhetnek egy szabályos reguláris kifejezésben: +\begin{center} \verb| !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ| +\verb$[\]^_`abcdefghijklmnopqrstuvwxyz{|}~$ +\end{center} + +\item Reguláris kifejezés nyelvtana \footnote{EBNF nyelvtan leírás} \par +\begin{center} +\begin{verbatim} +<regex> ::= <term> '|' <regex> | <term>; +<term> ::= { <factor> }; +<factor> ::= <base> { '*' | '+' | '?' }; +<base> ::= <char> | '\' <escape> | '[' <class> ']' | '(' <regex> ')'; +<class> ::= [ '^' ] { <char> | '\' <escape> }; +<escape> ::= 'd' | 'w' | 'W' | 's' | 'S' + | 't' | 'n' | 'v' | 'f' | 'r' | <special>; +<special> ::= '\' | '.' | '(' | ')' | '[' | ']' | '*' | '+' | '?'; +\end{verbatim} +\end{center} +A \verb$<char>$ osztály minden egyéb karaktert tartalmaz, ami nincs benne +a \verb$<special>$ osztályban. + +\item Reguláris kifejezés értelmezése \par +\begin{enumerate} + +\item Karakterosztályok \par +Minden \verb$<char>$ osztályban lévő karakter önmagát ismeri fel. +Minden \verb$'\' <escape>$ karakterpáros az \verb$<escape>$ osztály adott +karakterét ismeri fel. 
A további, speciális módon leírható karakterosztályok +az alábbiak szerint értelmezhetők: + +\begin{tabular}{|c|c|} +Reguláris kifejezés & Ekvivalens kifejezés, vagy karakterkód ($c_i$) \\ +\hline +\texttt{[} $a_1$ \texttt{-} $b_1$ $a_2$ \texttt{-} $b_2$ $\ldots$ $a_n$ \texttt{-} $b_n$ \texttt{]} & +$\exists k \in \mathbb{N}, k \leq n (a_k \leq c_i \leq b_k)$ \\ +\verb|\d| & \verb|[0-9]| \\ +\verb|\w| & \verb|[0-9A-Z_a-z]| \\ +\verb|\W| & \verb|[^\w]| \\ +\verb|\s| & \verb|[ \t\n\v\f\r]| \\ +\verb|\S| & \verb|[^\s]| \\ +\verb|\t| & $c_i = \mathtt{0x09}$ \\ +\verb|\n| & $c_i = \mathtt{0x0A}$ \\ +\verb|\v| & $c_i = \mathtt{0x0B}$ \\ +\verb|\f| & $c_i = \mathtt{0x0C}$ \\ +\verb|\r| & $c_i = \mathtt{0x0D}$ \\ + & +\end{tabular} + +\item Az \verb$'|'$ infix operátor, és a \verb$'*'$, \verb$'+'$, illetve \verb$'?'$ postfix operátorok \par +Az \verb$'|'$ operátor akkor ismer fel egy szöveget, ha azt a két operandusa közül valamelyik felismeri. +Példa: az \verb$ab|cd$ reguláris kifejezés az $\{\mathtt{ab}, \mathtt{cd} \}$ szöveghalmazt ismeri fel. \par +A \verb$'*'$ operátor akkor ismer fel egy szöveget, ha a paramétere valahányszor felismeri a szöveget +(ez lehet nulla is). Azaz például az \verb$a*$ reguláris kifejezés az +$\{\epsilon, \mathtt{a}, \mathtt{aa}, \ldots \}$ \footnotemark[\value{footnote}] +szöveghalmazt ismeri fel. \par +A \verb$'+'$ operátor akkor ismer fel egy szöveget, ha a paramétere legalább egyszer felismeri a szöveget. +Azaz például az \verb$a+$ reguláris kifejezés az $\{\mathtt{a}, \mathtt{aa}, \ldots \}$ +szöveghalmazt ismeri fel. \par +A \verb$'?'$ operátor akkor ismer fel egy szöveget, ha a paramétere nullaszor vagy egyszer felismeri a szöveget. +Azaz például az \verb$a?$ reguláris kifejezés az +$\{\epsilon, \mathtt{a} \}$ \footnotemark[\value{footnote}] +szöveghalmazt ismeri fel. 
\par +\footnotetext{ $\epsilon$ az üres szöveget jelöli } + +\item Zárójelezés \par +A reguláris kifejezés szabadon zárójelezhető az eredeti operátor precedencia felülírásának céljából. +A \verb$gra|ey$ reguláris kifejezés például a $\{\mathtt{gra}, \mathtt{ey} \}$ szöveghalmazt ismeri fel. +Ha viszont átírjuk a kifejezést \verb$gr(a|e)y$-ra, akkor már a $\{\mathtt{gray}, \mathtt{grey} \}$ +szöveghalmazt ismeri fel. + +\end{enumerate} +\item Felhasználói interfész \par +A programnak kötelező legalább egy parancssori paramétert átadni, +a feldolgozandó reguláris kifejezést. A program második paramétere a fájlnév, +ahonnan a bemenetet be kell olvasnia. Amennyiben ez nincs megadva, +a program a szabványos bemeneten érkező szöveget fogja feldolgozni. Amennyiben a program további paramétereket is kap, +azokat figyelmen kívül fogja hagyni. Példa a szabályos használatra: \par +\begin{center} \verb$my_grep 'asd+' file.txt$ \end{center} + +\end{enumerate} + +\end{document} \ No newline at end of file -- GitLab