summaryrefslogtreecommitdiff
path: root/ccan/strgrp/_info
blob: b58344fcf017909c6faa5ed1b0310e02f1134f0e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include "config.h"
#include <stdio.h>
#include <string.h>

/**
 * strgrp - group/cluster similar strings.
 *
 * strgrp uses the Longest Common Subsequence (LCS) algorithm[1] to cluster
 * similar strings. It is governed by a threshold which is compared against
 * the computed normalised LCS length for all known groups.
 *
 * As a coarse and not entirely accurate summary, strgrp takes the following
 * steps:
 *
 * 1. For all known strings, calculate the normalised LCS value against the
 *    input string
 * 2. Find the maximum normalised LCS value and associated group
 * 3. If the calculated normalised LCS value exceeds the configured threshold,
 *    add the input string to the group, otherwise create a new group
 *
 * The clustering operation is expensive; LCS on its own is computationally
 * O(mn) on its two input strings and optimally requires O(min(m,n)) memory. In
 * general each input string should be compared against all known strings,
 * giving O(n^2) behaviour of the clustering algorithm on top of the O(mn) LCS
 * similarity measurement.
 *
 * strgrp tries to battle this complexity on several fronts:
 *
 * 1. Coarse reduction of the required comparisons
 * 1a. Each group has a 'key', which is the string that triggered the creation
 *     of the group
 * 1b. Input strings are only compared against group keys rather than all known
 *     strings, reducing the complexity to the current number of groups rather
 *     than all known strings. Note due the pathological case where the number
 *     of groups is equal to the number of known strings the algorithm still
 *     has O(n^2) computational complexity
 *
 * 2. Elimination of LCS computations that will never breach the configured
 *    threshold
 * 2a. This can be measured from the length of the compared strings
 * 2b. This avoids invoking the O(mn) behaviour of LCS
 *
 * 3. Caching of input strings and their associated group
 * 3a. By incurring the cost of a map's string hash function we may eliminate
 *     all calls to the LCS function for exact matches, potentially reducing
 *     the insertion to a constant-time operation.
 *
 * 4. Whilst the data dependencies of LCS prevent internally parallel
 *    implementations, LCS as a function can be applied in parallel. The code
 *    uses OpenMP to automatically and concurrently distribute scoring of the
 *    input string against group keys across threads.
 *
 * [1] https://en.wikipedia.org/wiki/Longest_common_subsequence_problem
 *
 * License: LGPL
 * Author: Andrew Jeffery <andrew@aj.id.au>
 *
 * Example:
 *     #define BUF_SIZE 512
 *     FILE *const f = fdopen(0, "r");
 *     char *buf = malloc(BUF_SIZE);
 *     struct strgrp *ctx = strgrp_new(0.85);
 *     while(fgets(buf, BUF_SIZE, f)) {
 *         buf[strcspn(buf, "\r\n")] = '\0';
 *         if (!strgrp_add(ctx, buf, NULL)) {
 *             printf("Failed to classify %s\n", buf);
 *         }
 *     }
 *
 *     // Re-implement something similar to strgrp_print() via API as an
 *     // example
 *     struct strgrp_iter *iter = strgrp_iter_new(ctx);
 *     const struct strgrp_grp *grp;
 *     while(NULL != (grp = strgrp_iter_next(iter))) {
 *         printf("%s\n", strgrp_grp_key(grp));
 *         struct strgrp_grp_iter *grp_iter = strgrp_grp_iter_new(grp);
 *         const struct strgrp_item *item;
 *         while(NULL != (item = strgrp_grp_iter_next(grp_iter))) {
 *              printf("\t%s\n", strgrp_item_key(item));
 *         }
 *         strgrp_grp_iter_free(grp_iter);
 *     }
 *     strgrp_iter_free(iter);
 *     strgrp_free(ctx);
 *     free(buf);
 *     fclose(f);
 */
int main(int argc, char *argv[]) {
    if (argc != 2) {
        return 1;
    }

    if (strcmp(argv[1], "depends") == 0) {
        printf("ccan/darray\n");
        printf("ccan/stringmap\n");
        printf("ccan/tal\n");
        printf("ccan/tal/str\n");
        return 0;
    }

    if (strcmp(argv[1], "testdepends") == 0) {
        printf("ccan/str\n");
        return 0;
    }

#if HAVE_OPENMP
    if (strcmp(argv[1], "cflags") == 0) {
        printf("-fopenmp\n");
        return 0;
    }
#endif

    return 1;
}