From ec7cf149e9063772eb884c3a7d44b65291bee860 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Kurucz=20Gy=C3=B6rgy?= <kuruczgyurci@hotmail.com>
Date: Thu, 4 Oct 2018 00:04:53 +0200
Subject: [PATCH] Modularized the codebase further. Added a specification file.

---
 .gitignore |   2 +
 Makefile   |   2 +-
 class.c    |  36 ++++++++--
 class.h    |   8 ++-
 dfa.c      | 197 ++++++++++++++++-------------------------------------
 main.c     |  35 ++--------
 nfa.c      | 101 +++++++++++++++++++++++++++
 nfa.h      |  11 +++
 parse.c    |   8 ++-
 spec.tex   | 102 +++++++++++++++++++++++++++
 10 files changed, 320 insertions(+), 182 deletions(-)
 create mode 100644 nfa.c
 create mode 100644 nfa.h
 create mode 100644 spec.tex

diff --git a/.gitignore b/.gitignore
index ba2906d..5cf1daf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,3 @@
 main
+spec.*
+!spec.tex
diff --git a/Makefile b/Makefile
index 2c2e527..7bcedbf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,2 +1,2 @@
 all:
-	gcc -Wall -pedantic -g main.c class.c dfa.c parse.c -o main
\ No newline at end of file
+	gcc -Wall -pedantic -g main.c class.c nfa.c dfa.c parse.c -o main
\ No newline at end of file
diff --git a/class.c b/class.c
index 9f837a1..253409f 100644
--- a/class.c
+++ b/class.c
@@ -1,16 +1,22 @@
 #include "class.h"
 #include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
 
 void class_union(class *a, class b) {
-    for (int i = 0; i < 4; i++) (a->d)[i] |= (b.d)[i];
+    for (int i = 0; i < CLASS_LEN; i++) (a->d)[i] |= (b.d)[i]; /* in place: *a becomes the union of *a and b, one 64-bit word at a time */
+}
+
+void class_neg(class *a) { /* complement the class in place */
+    for (int i = 0; i < CLASS_LEN; i++) a->d[i] = ~(a->d[i]); /* flips every bit of every word, so all CLASS_BITS symbols are toggled */
 }
 
 bool class_any(class c) {
-    return c.d[0] | c.d[1] | c.d[2] | c.d[3];    
+    return c.d[0] | c.d[1];    /* true when at least one bit is set; NOTE(review): hard-codes two words, i.e. only correct while CLASS_LEN == 2 */
 }
 
 bool class_eq(class a, class b) {
-    for (int i = 0; i < 4; i++) if (a.d[i] != b.d[i]) return false;
+    for (int i = 0; i < CLASS_LEN; i++) if (a.d[i] != b.d[i]) return false; /* word-wise comparison; first differing word decides */
     return true;
 }
 
@@ -18,15 +24,31 @@ bool class_has(class a, char c) {
     return a.d[c/64] & (1ULL << c%64);
 }
 
-char* class_str(class c) {
-    char *res = (char*)malloc(0x101);
+char* class_str(class c, int *out_len) { /* renders c as a malloc'd NUL-terminated string; *out_len receives its length without the NUL; the caller frees the result */
+    int bits = 0; /* number of symbols contained in c */
+    for (int i = 0; i < CLASS_BITS; i++) {
+        if (c.d[i/64] & (1ULL << i%64)) bits++;
+    }
+
+    char *res = (char*)malloc(CLASS_BITS*3); /* sized for three bytes per symbol: an escaped symbol is a backslash plus two hex digits */
     int idx = 0;
-    for (int i = 0; i <= 0xFF; i++) {
+    if (bits > CLASS_BITS/2) { /* more than half of the symbols set: print the complement instead, prefixed with '^' */
+        class_neg(&c);
+        res[idx++] = '^';
+    }
+
+    for (int i = 0; i < CLASS_BITS; i++) { /* emit the (possibly complemented) members in ascending code order */
         if (c.d[i/64] & (1ULL << i%64)) {
-            res[idx++] = (char)i;
+            char ch = (char)i;
+            if (iscntrl(ch) || ch == '\\' || ch == '\"') { /* control characters, backslash and double quote are written escaped */
+                idx += sprintf(res+idx, "\\%02X", ch); /* backslash followed by the code as two uppercase hex digits */
+            } else {
+                res[idx++] = ch;
+            }
         }
     }
     res[idx] = '\0';
+    *out_len = idx; /* length of the text, terminator not counted */
     return res;
 }
 
diff --git a/class.h b/class.h
index 58dddc5..5e8608b 100644
--- a/class.h
+++ b/class.h
@@ -1,19 +1,21 @@
-
 #ifndef _CLASS_H_
 #define _CLASS_H_
 
 #include <stdbool.h>
 
+#define CLASS_BITS (1 << 7)         /* number of symbols a class can hold (codes 0x00..0x7F) */
+#define CLASS_LEN (CLASS_BITS/64)   /* 64-bit words needed for CLASS_BITS bits; parenthesised so it is safe inside larger expressions */
 typedef struct {
-    unsigned long long d[4];
+    unsigned long long d[CLASS_LEN];
 } class;
 static const class CLASS_NONE = { {0} };
 
 void class_union(class *a, class b);
+void class_neg(class *a);
 bool class_any(class c);
 bool class_eq(class a, class b);
 bool class_has(class a, char c);
-char* class_str(class c);
+char* class_str(class c, int *out_len);
 class class_char(char c);
 
 #endif
\ No newline at end of file
diff --git a/dfa.c b/dfa.c
index 8f873c6..bfe8ace 100644
--- a/dfa.c
+++ b/dfa.c
@@ -1,4 +1,5 @@
 #include "dfa.h"
+#include "nfa.h"
 
 #include <stdlib.h>
 #include <string.h>
@@ -16,118 +17,15 @@ typedef struct {
     bool end;
 } dfa_temp_node;
 
-typedef struct {
-    class class;
-    int a, b;
-} nfa_node;
-
-void nfa_class(nfa_node *nfa, int idx, class class) {
-    nfa[idx] = (nfa_node){ class, idx+1, -1 };
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-void nfa_union(nfa_node *nfa, int idx, int a, int b) {
-    nfa[idx] = (nfa_node){ CLASS_NONE, a, b };
-    nfa[a+1].a = idx+1;
-    nfa[b+1].a = idx+1;
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-void nfa_concat(nfa_node *nfa, int idx, int a, int b) {
-    nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 };
-    nfa[a+1].a = b;
-    nfa[b+1].a = idx+1;
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-void nfa_star(nfa_node *nfa, int idx, int a) {
-    nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 };
-    nfa[a+1].a = idx+1;
-    nfa[a+1].b = a;
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-void nfa_plus(nfa_node *nfa, int idx, int a) {
-    nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 };
-    nfa[a+1].a = idx+1;
-    nfa[a+1].b = a;
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-void nfa_questionmark(nfa_node *nfa, int idx, int a) {
-    nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 };
-    nfa[a+1].a = idx+1;
-    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 };
-}
-
-int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx) {
-    ast_node current = ast[ast_idx];
-    if (current.op == '*') {
-        int a = ast_to_nfa(nfa, ast, idx, current.a);
-        int r = *idx;
-        nfa_star(nfa, r, a);
-        *idx += 2;
-        return r;
-    } else if (current.op == '|') {
-        int a = ast_to_nfa(nfa, ast, idx, current.a);
-        int b = ast_to_nfa(nfa, ast, idx, current.b);
-        int r = *idx;
-        nfa_union(nfa, r, a, b);
-        *idx += 2;
-        return r;
-    } else if (current.op == '&') {
-        int a = ast_to_nfa(nfa, ast, idx, current.a);
-        int b = ast_to_nfa(nfa, ast, idx, current.b);
-        int r = *idx;
-        nfa_concat(nfa, r, a, b);
-        *idx += 2;
-        return r;
-    } else if (current.op == '+') {
-        int a = ast_to_nfa(nfa, ast, idx, current.a);
-        int r = *idx;
-        nfa_plus(nfa, r, a);
-        *idx += 2;
-        return r;
-    } else if (current.op == '?') {
-        int a = ast_to_nfa(nfa, ast, idx, current.a);
-        int r = *idx;
-        nfa_questionmark(nfa, r, a);
-        *idx += 2;
-        return r;
-    } else {
-        int r = *idx;
-        nfa_class(nfa, r, current.class);
-        *idx += 2;
-        return r;
-    }
-}
-
-void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set) {
-    int p = 0;
-    stack[p++] = idx;
-    while (p > 0) {
-        idx = stack[--p];
-        nfa_node n = nfa[idx];
-        if (vis[idx]) continue;
-        vis[idx] = 1;
-        if (!class_any(n.class)) {
-            /* both epsilon transitions */
-            if (n.a != -1) stack[p++] = n.a;
-            if (n.b != -1) stack[p++] = n.b;
-        } else {
-            /* n.a 'n.class' transition */
-            class_union(&set[n.a], n.class);
-        }
-    }
-}
-
 dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) {
     int nfa_begin = nfa_count - 2;
     int nfa_end = nfa_count - 1;
 
-    int *stack =(int*)malloc(sizeof(int)*nfa_count);
+    //variables used in the nfa_set function
+    int *stack = (int*)malloc(sizeof(int)*nfa_count);
     int *vis =  (int*)malloc(sizeof(int)*nfa_count);
     class *class_set = (class*)malloc(sizeof(class)*nfa_count);
+
     dfa_temp_node *dfa_nodes = (dfa_temp_node*)malloc(sizeof(dfa_temp_node)*nfa_count);
     int dfa_count = 0;
     int *dfa_stack = (int*)malloc(sizeof(int)*nfa_count);
@@ -146,17 +44,16 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) {
         dfa_count++;
     }
     while (p > 0) {
-        int dfa_idx = dfa_stack[--p];
-        dfa_temp_node *current = &dfa_nodes[dfa_idx];
-        printf("processing dfa[%d]; set={", dfa_idx);
-        for (int i = 0; i < current->n_set; i++) {
-            printf("%d,", current->set[i]);
-        }
+        int current_idx = dfa_stack[--p];
+        dfa_temp_node *current = &dfa_nodes[current_idx];
+        printf("processing dfa[%d]; set={", current_idx);
+        for (int i = 0; i < current->n_set; i++) printf("%d,", current->set[i]);
         printf("}\n");
-        int **reachable_by_symbol = (int**)malloc(sizeof(int*)*0xFF);
-        int *symbol_lengths = (int*)malloc(sizeof(int)*0xFF);
-        char *symbols = (char*)malloc(sizeof(char)*0xFF);
-        int symbol_count = 0;
+        
+        int **edges = (int**)malloc(sizeof(int*)*0xFF);
+        int *edge_set_lengths = (int*)malloc(sizeof(int)*0xFF);
+        char *edge_symbols = (char*)malloc(sizeof(char)*0xFF);
+        int edge_count = 0;
         for (int i = 0; i < current->n_set; i++) {
             //reset vectors
             for (int k = 0; k < nfa_count; k++) {
@@ -169,45 +66,44 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) {
             current->end = current->end || vis[nfa_end];
             //calculate all possible symbols
             class all_classes = CLASS_NONE;
-            for (int f = 0; f < nfa_count; f++)
-                class_union(&all_classes, class_set[f]);
+            for (int f = 0; f < nfa_count; f++) class_union(&all_classes, class_set[f]);
             //iterate possible symbols
-            char *all_classes_str = class_str(all_classes);
-            printf("all possible reachable classes from nfa[%d]: %s\n", nfa_node_idx, all_classes_str);
-            for (char* ch = all_classes_str; *ch; ch++) {
-                int symbol = *ch;
+            for (int ch = 0; ch < CLASS_BITS; ch++) {
+                int symbol = (char)ch;
+                if (!class_has(all_classes, symbol)) continue;
                 int symbol_idx = -1;
-                for (int f = 0; f < symbol_count; f++) {
-                    if (symbols[f] == symbol) {
+                for (int f = 0; f < edge_count; f++) {
+                    if (edge_symbols[f] == symbol) {
                         symbol_idx = f;
                         break;
                     }
                 }
                 if (symbol_idx == -1) {
-                    reachable_by_symbol[symbol_count] = (int*)malloc(sizeof(int)*nfa_count);
-                    symbol_lengths[symbol_count] = 0;
-                    symbols[symbol_count] = symbol;
-                    symbol_idx = symbol_count++;
+                    edges[edge_count] = (int*)malloc(sizeof(int)*nfa_count);
+                    edge_set_lengths[edge_count] = 0;
+                    edge_symbols[edge_count] = symbol;
+                    symbol_idx = edge_count++;
                 }
                 //calculate reachable nfa node set for this symbol
                 printf("\ttransition set from nfa[%d] for symbol '%c': {", nfa_node_idx, symbol);
                 for (int f = 0; f < nfa_count; f++) {
                     if (class_has(class_set[f], symbol)) {
-                        reachable_by_symbol[symbol_idx][symbol_lengths[symbol_idx]++] = f;
+                        edges[symbol_idx][edge_set_lengths[symbol_idx]++] = f;
                         printf("%d,", f);
                     }
                 }
                 printf("}\n");
             }
         }
-        int *transitions = (int*)malloc(sizeof(int)*symbol_count);
-        class *transition_classes = (class*)malloc(sizeof(class)*symbol_count);
-        for (int i = 0; i < symbol_count; i++) {
-            qsort(reachable_by_symbol[i], symbol_lengths[i], sizeof(int), int_cmp);
+        int *transitions = (int*)malloc(sizeof(int)*edge_count);
+        class *transition_classes = (class*)malloc(sizeof(class)*edge_count);
+        int reduced_edge_count = 0;
+        for (int i = 0; i < edge_count; i++) {
+            qsort(edges[i], edge_set_lengths[i], sizeof(int), int_cmp);
             int target = -1;
             for (int k = 0; k < dfa_count; k++) {
-                if (dfa_nodes[k].n_set != symbol_lengths[i]) continue;
-                if (memcmp(dfa_nodes[k].set, reachable_by_symbol[i], symbol_lengths[i]) == 0) { //match
+                if (dfa_nodes[k].n_set != edge_set_lengths[i]) continue;
+                if (memcmp(dfa_nodes[k].set, edges[i], sizeof(int)*edge_set_lengths[i]) == 0) { //match; memcmp takes a byte count, so scale the element count by sizeof(int)
                     target = k;
                     break;
                 }
@@ -215,22 +111,40 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) {
             if (target == -1) {
                 printf("\t\tpushing new dfa_node dfa[%d]\n", dfa_count);
                 dfa_nodes[dfa_count] = (dfa_temp_node){
-                    symbol_lengths[i], -1,
-                    reachable_by_symbol[i],
+                    edge_set_lengths[i], -1,
+                    edges[i],
                     NULL, NULL, false
                 };
                 dfa_stack[p++] = dfa_count;
                 target = dfa_count;
                 dfa_count++;
             }
-            transitions[i] = target;
-            transition_classes[i] = class_char(symbols[i]);
+            int reusable_edge = -1;
+            for (int k = 0; k < reduced_edge_count; k++) {
+                if (transitions[k] == target) {
+                    reusable_edge = k;
+                    break;
+                }
+            }
+            class c = class_char(edge_symbols[i]);
+            if (reusable_edge != -1) {
+                class_union(&transition_classes[reusable_edge], c);
+            } else {
+                transitions[reduced_edge_count] = target;
+                transition_classes[reduced_edge_count] = c;
+                reduced_edge_count++;
+            }
         }
-        current->n_transitions = symbol_count;
+        current->n_transitions = reduced_edge_count;
         current->transitions = transitions;
         current->transition_classes = transition_classes;
     }
 
+    free(vis);
+    free(stack);
+    free(class_set);
+    free(dfa_stack);
+
     dfa dfa_obj = {
         (dfa_node*)malloc(sizeof(dfa_node) * dfa_count),
         dfa_count,
@@ -250,6 +164,9 @@ dfa nfa_to_dfa(nfa_node *nfa, int nfa_count) {
         }
         dfa_obj.nodes[i] = d;
     }
+
+    free(dfa_nodes);
+
     return dfa_obj;
 }
 
diff --git a/main.c b/main.c
index 1da149c..0eaf1e6 100644
--- a/main.c
+++ b/main.c
@@ -5,33 +5,6 @@
 #include <stdlib.h>
 
 int main(int argc, char **argv) {
-    // ast_node ast[] = {
-    //     { '*', CLASS_NONE, 1, -1 },
-    //     { '|', CLASS_NONE, 2, 3 },
-    //     { '\0', class_char('a'), -1, -1 },
-    //     { '\0', class_char('b'), -1, -1 }
-    // };
-    // int ast_count = 4;
-
-    // ast_node ast[] = {
-    //     /* 0*/{ '*', CLASS_NONE, 1, -1 },
-    //     /* 1*/{ '|', CLASS_NONE, 2, 7 },
-    //     /* 2*/{ '|', CLASS_NONE, 3, 4 },
-    //     /* 3*/{ '0', class_char('a'), -1, -1 },
-    //     /* 4*/{ '&', CLASS_NONE, 5, 6 },
-    //     /* 5*/{ '0', class_char('a'), -1, -1 },
-    //     /* 6*/{ '0', class_char('b'), -1, -1 },
-    //     /* 7*/{ '&', CLASS_NONE, 8, 9 },
-    //     /* 8*/{ '0', class_char('a'), -1, -1 },
-    //     /* 9*/{ '&', CLASS_NONE, 10, 11 },
-    //     /*10*/{ '0', class_char('b'), -1, -1 },
-    //     /*11*/{ '&', CLASS_NONE, 12, 13 },
-    //     /*12*/{ '0', class_char('c'), -1, -1 },
-    //     /*13*/{ '*', CLASS_NONE, 14, -1 },
-    //     /*14*/{ '0', class_char('x'), -1, -1 },
-    // };
-    //int ast_count = sizeof(ast) / sizeof(ast_node);
-
     if (argc < 2) {
         printf("No regex argument found!\n");
         return 1;
@@ -43,7 +16,8 @@ int main(int argc, char **argv) {
     printf("Parsed regex '%s' [root=%d]:\n", r, ast_root);
     for (int i = 0; i < ast_count; i++) {
         ast_node a = parsed_ast[i];
-        char* str = class_str(a.class);
+        int out_len;
+        char* str = class_str(a.class, &out_len);
         printf("\tast[%d] = {%c, %s, %d, %d}\n", i, a.op, str, a.a, a.b);
         free(str);
     }
@@ -56,8 +30,9 @@ int main(int argc, char **argv) {
         dfa_node node = d.nodes[i];
         printf("dfa%c%c[%d] transitions: { ", (i == d.begin ? '>' : ' '), (node.end ? '!' : ' '), i);
         for (int k = 0; k < node.edges; k++) {
-            char* str = class_str(node.what[k]);
-            printf("%s:%d, ", str, node.to[k]);
+            int out_len;
+            char* str = class_str(node.what[k], &out_len);
+            printf("\"%s\":%d, ", str, node.to[k]);
             free(str);
         }
         printf("}\n");
diff --git a/nfa.c b/nfa.c
new file mode 100644
index 0000000..74875e8
--- /dev/null
+++ b/nfa.c
@@ -0,0 +1,101 @@
+#include "nfa.h"
+
+void nfa_class(nfa_node *nfa, int idx, class class) { /* leaf fragment in slots idx (entry) and idx+1 (exit) that consumes one symbol of `class` */
+    nfa[idx] = (nfa_node){ class, idx+1, -1 }; /* entry: edge labelled `class` to the exit node */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* exit: no successors until an enclosing fragment links it */
+}
+
+void nfa_union(nfa_node *nfa, int idx, int a, int b) { /* alternation of the fragments whose entry nodes are a and b; new fragment lives in idx/idx+1 */
+    nfa[idx] = (nfa_node){ CLASS_NONE, a, b }; /* entry: unlabelled edges into both alternatives */
+    nfa[a+1].a = idx+1; /* exit of a (every fragment keeps its exit at entry+1) -> new exit */
+    nfa[b+1].a = idx+1; /* exit of b -> new exit */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* new exit, left unlinked for the enclosing fragment */
+}
+
+void nfa_concat(nfa_node *nfa, int idx, int a, int b) { /* concatenation: fragment a followed by fragment b; new fragment lives in idx/idx+1 */
+    nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; /* entry: single unlabelled edge into a */
+    nfa[a+1].a = b; /* exit of a -> entry of b */
+    nfa[b+1].a = idx+1; /* exit of b -> new exit */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* new exit, left unlinked for the enclosing fragment */
+}
+
+void nfa_star(nfa_node *nfa, int idx, int a) { /* zero or more repetitions of fragment a; new fragment lives in idx/idx+1 */
+    nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; /* entry: either enter a or skip straight to the exit */
+    nfa[a+1].a = idx+1; /* exit of a -> new exit */
+    nfa[a+1].b = a; /* exit of a -> back to a's entry, allowing another pass */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* new exit, left unlinked for the enclosing fragment */
+}
+
+void nfa_plus(nfa_node *nfa, int idx, int a) { /* one or more repetitions of fragment a; new fragment lives in idx/idx+1 */
+    nfa[idx] = (nfa_node){ CLASS_NONE, a, -1 }; /* entry: must enter a (no skip edge, unlike nfa_star) */
+    nfa[a+1].a = idx+1; /* exit of a -> new exit */
+    nfa[a+1].b = a; /* exit of a -> back to a's entry, allowing another pass */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* new exit, left unlinked for the enclosing fragment */
+}
+
+void nfa_questionmark(nfa_node *nfa, int idx, int a) { /* optional fragment a (zero or one pass); new fragment lives in idx/idx+1 */
+    nfa[idx] = (nfa_node){ CLASS_NONE, a, idx+1 }; /* entry: either enter a or skip straight to the exit */
+    nfa[a+1].a = idx+1; /* exit of a -> new exit; no back edge, so a runs at most once */
+    nfa[idx+1] = (nfa_node){ CLASS_NONE, -1, -1 }; /* new exit, left unlinked for the enclosing fragment */
+}
+
+int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx) { /* recursively emits the fragment for ast[ast_idx] into nfa; *idx is the next free slot; returns the fragment's entry index */
+    ast_node current = ast[ast_idx];
+    if (current.op == '*') {
+        int a = ast_to_nfa(nfa, ast, idx, current.a); /* operands are emitted first, so they occupy lower slots than their parent */
+        int r = *idx;
+        nfa_star(nfa, r, a);
+        *idx += 2; /* every AST node consumes exactly two NFA slots (entry and exit) */
+        return r;
+    } else if (current.op == '|') {
+        int a = ast_to_nfa(nfa, ast, idx, current.a);
+        int b = ast_to_nfa(nfa, ast, idx, current.b);
+        int r = *idx;
+        nfa_union(nfa, r, a, b);
+        *idx += 2;
+        return r;
+    } else if (current.op == '&') { /* '&' is the AST operator used for concatenation */
+        int a = ast_to_nfa(nfa, ast, idx, current.a);
+        int b = ast_to_nfa(nfa, ast, idx, current.b);
+        int r = *idx;
+        nfa_concat(nfa, r, a, b);
+        *idx += 2;
+        return r;
+    } else if (current.op == '+') {
+        int a = ast_to_nfa(nfa, ast, idx, current.a);
+        int r = *idx;
+        nfa_plus(nfa, r, a);
+        *idx += 2;
+        return r;
+    } else if (current.op == '?') {
+        int a = ast_to_nfa(nfa, ast, idx, current.a);
+        int r = *idx;
+        nfa_questionmark(nfa, r, a);
+        *idx += 2;
+        return r;
+    } else { /* any other op value is treated as a leaf carrying a character class */
+        int r = *idx;
+        nfa_class(nfa, r, current.class);
+        *idx += 2;
+        return r;
+    }
+}
+
+void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set) { /* walks unlabelled edges from idx; marks reached nodes in vis and records labelled edges in set (indexed by target node) */
+    int p = 0; /* stack pointer into the caller-supplied scratch stack */
+    stack[p++] = idx;
+    while (p > 0) {
+        idx = stack[--p];
+        nfa_node n = nfa[idx];
+        if (vis[idx]) continue; /* visited check happens on pop, so a node may sit on the stack more than once; NOTE(review): confirm the caller's stack capacity covers that */
+        vis[idx] = 1;
+        if (!class_any(n.class)) { /* an empty class marks an unlabelled (epsilon) node */
+            /* both epsilon transitions */
+            if (n.a != -1) stack[p++] = n.a;
+            if (n.b != -1) stack[p++] = n.b;
+        } else {
+            /* n.a 'n.class' transition */
+            class_union(&set[n.a], n.class); /* not followed: only accumulate which symbols lead to node n.a */
+        }
+    }
+}
\ No newline at end of file
diff --git a/nfa.h b/nfa.h
new file mode 100644
index 0000000..33b1f83
--- /dev/null
+++ b/nfa.h
@@ -0,0 +1,11 @@
+#pragma once /* the anonymous-struct typedef below cannot be repeated, so protect against double inclusion */
+#include "class.h"
+#include "parse.h"
+
+typedef struct {
+    class class; /* label of the edge to `a`; an empty class (CLASS_NONE) makes both edges unlabelled */
+    int a, b;    /* successor node indices, -1 when absent */
+} nfa_node;
+
+int ast_to_nfa(nfa_node *nfa, ast_node *ast, int *idx, int ast_idx); /* emits ast[ast_idx] into nfa starting at *idx, advances *idx, returns the entry node */
+void nfa_set(nfa_node *nfa, int idx, int *stack, int *vis, class *set); /* from idx: marks nodes reached over unlabelled edges in vis, ORs edge labels into set[target] */
\ No newline at end of file
diff --git a/parse.c b/parse.c
index d713cc4..3a85eb3 100644
--- a/parse.c
+++ b/parse.c
@@ -19,11 +19,17 @@ int parse_regex(parser_state *s);
 
 class parse_class(parser_state *s) {
     class c = CLASS_NONE;
-    char t;
+    bool neg = false; /* becomes true when the class body starts with '^' */
+    char t = s->tokens[s->idx]; /* peek at the first token of the class body; NOTE(review): the loop below stops only at ']' - confirm callers guarantee a closing bracket */
+    if (t == '^') {
+        neg = true;
+        s->idx++; /* consume the '^' so it is not added as a member */
+    }
     while (t = s->tokens[s->idx], t != ']') {
         s->idx++;
         class_union(&c, class_char(t));
     }
+    if (neg) class_neg(&c); /* negated class: every symbol except the listed ones */
     return c;
 }
 
diff --git a/spec.tex b/spec.tex
new file mode 100644
index 0000000..1f0090e
--- /dev/null
+++ b/spec.tex
@@ -0,0 +1,102 @@
+
+\documentclass{article}
+\usepackage{amsfonts}
+
+\title{Reguláris kifejezések}
+\author{Kurucz György}
+
+\begin{document}
+\maketitle
+
+\begin{enumerate}
+\item Karakterkészlet \par
+A program az ASCII karakterkészlet karaktereit képes feldolgozni.
+Egy helyes bemeneti fájl minden $c_i$ bájtjára igaz,
+hogy $c_i \leq \mathtt{0x7F}$.
+
+\item Reguláris kifejezés karakterkészlete \par
+A program által feldolgozott reguláris kifejezés karakterkészlete a bemeneti
+karakterkészlet egy részhalmaza, azaz ugyanúgy ASCII kódolású. A reguláris kifejezés minden $r_i$
+karakterére igaz, hogy $\mathtt{0x20} \leq r_i \leq \mathtt{0x7E}$. Ez alapján
+az alábbi karakterek szerepelhetnek egy szabályos reguláris kifejezésben:
+\begin{center} \verb| !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ|
+\verb$[\]^_`abcdefghijklmnopqrstuvwxyz{|}~$
+\end{center}
+
+\item Reguláris kifejezés nyelvtana \footnote{EBNF nyelvtan leírás} \par
+\begin{center}
+\begin{verbatim}
+<regex> ::= <term> '|' <regex> | <term>;
+<term> ::= { <factor> };
+<factor> ::= <base> { '*' | '+' | '?' };
+<base> ::= <char> | '\' <escape> | '[' <class> ']' | '(' <regex> ')';
+<class> ::= [ '^' ] { <char> | '\' <escape> };
+<escape> ::= 'd' | 'w' | 'W' | 's' | 'S'
+    | 't' | 'n' | 'v' | 'f' | 'r' | <special>;
+<special> ::= '\' | '.' | '(' | ')' | '[' | ']' | '*' | '+' | '?';
+\end{verbatim}
+\end{center}
+A \verb$<char>$ osztály minden egyéb karaktert tartalmaz, ami nincs benne
+a \verb$<special>$ osztályban.
+
+\item Reguláris kifejezés értelmezése \par
+\begin{enumerate}
+
+\item Karakterosztályok \par
+Minden \verb$<char>$ osztályban lévő karakter önmagát ismeri fel.
+Minden \verb$'\' <escape>$ karakterpáros az \verb$<escape>$ osztály adott
+karakterét ismeri fel. A további, speciális módon leírható karakterosztályok
+az alábbiak szerint értelmezhetők:
+
+\begin{tabular}{|c|c|}
+Reguláris kifejezés & Ekvivalens kifejezés, vagy karakterkód ($c_i$) \\
+\hline
+\texttt{[} $a_1$ \texttt{-} $b_1$ $a_2$ \texttt{-} $b_2$ $\ldots$ $a_n$ \texttt{-} $b_n$ \texttt{]} &
+$\exists k \in \mathbb{N}, k \leq n (a_k \leq c_i \leq b_k)$ \\
+\verb|\d| & \verb|[0-9]| \\
+\verb|\w| & \verb|[0-9A-Z_a-z]| \\
+\verb|\W| & \verb|[^\w]| \\
+\verb|\s| & \verb|[ \t\n\v\f\r]| \\
+\verb|\S| & \verb|[^\s]| \\
+\verb|\t| & $c_i = \mathtt{0x09}$ \\
+\verb|\n| & $c_i = \mathtt{0x0A}$ \\
+\verb|\v| & $c_i = \mathtt{0x0B}$ \\
+\verb|\f| & $c_i = \mathtt{0x0C}$ \\
+\verb|\r| & $c_i = \mathtt{0x0D}$ \\
+ & 
+\end{tabular}
+
+\item Az \verb$'|'$ infix operátor, és a \verb$'*'$, \verb$'+'$, illetve \verb$'?'$ postfix operátorok \par
+Az \verb$'|'$ operátor akkor ismer fel egy szöveget, ha azt a két operandusa közül valamelyik felismeri.
+Példa: az \verb$ab|cd$ reguláris kifejezés az $\{\mathtt{ab}, \mathtt{cd} \}$ szöveghalmazt ismeri fel. \par
+A \verb$'*'$ operátor akkor ismer fel egy szöveget, ha a paramétere valahányszor felismeri a szöveget
+(ez lehet nulla is). Azaz például az \verb$a*$ reguláris kifejezés az
+$\{\epsilon, \mathtt{a}, \mathtt{aa}, \ldots \}$ \footnotemark[\value{footnote}]
+szöveghalmazt ismeri fel. \par
+A \verb$'+'$ operátor akkor ismer fel egy szöveget, ha a paramétere legalább egyszer felismeri a szöveget.
+Azaz például az \verb$a+$ reguláris kifejezés az $\{\mathtt{a}, \mathtt{aa}, \ldots \}$
+szöveghalmazt ismeri fel. \par
+A \verb$'?'$ operátor akkor ismer fel egy szöveget, ha a paramétere nullaszor vagy egyszer felismeri a szöveget.
+Azaz például az \verb$a?$ reguláris kifejezés az
+$\{\epsilon, \mathtt{a} \}$ \footnotemark[\value{footnote}]
+szöveghalmazt ismeri fel. \par
+\footnotetext{ $\epsilon$ az üres szöveget jelöli }
+
+\item Zárójelezés \par
+A reguláris kifejezés szabadon zárójelezhető az eredeti operátor precedencia felülírásának céljából.
+A \verb$gra|ey$ reguláris kifejezés például a $\{\mathtt{gra}, \mathtt{ey} \}$ szöveghalmazt ismeri fel.
+Ha viszont átírjuk a kifejezést \verb$gr(a|e)y$-ra, akkor már a $\{\mathtt{gray}, \mathtt{grey} \}$
+szöveghalmazt ismeri fel.
+
+\end{enumerate}
+\item Felhasználói interfész \par
+A programnak kötelező legalább egy parancssori paramétert átadni,
+a feldolgozandó reguláris kifejezést. A program második paramétere a fájlnév,
+ahonnan a bemenetet be kell olvasnia. Amennyiben ez nincs megadva,
+a program a szabványos bemeneten érkező szöveget fogja feldolgozni. Amennyiben a program további
+paramétereket is kap, azokat figyelmen kívül fogja hagyni. Példa a szabályos használatra: \par
+\begin{center} \verb$my_grep 'asd+' file.txt$ \end{center}
+
+\end{enumerate}
+
+\end{document}
\ No newline at end of file
-- 
GitLab