]> git.somenet.org - pub/jan/netsec2.git/blob - srcfeat_power.py
GITOLITE.txt
[pub/jan/netsec2.git] / srcfeat_power.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 from csv import reader, Sniffer, writer, QUOTE_MINIMAL
6 from os.path import exists
7 from argparse import ArgumentParser
8
9
def principal_symbols(f_in, feat, sortby):
    """Estimate the number of principal symbols of a feature per group.

    Reads the CSV file *f_in*, groups its rows by the value found in the
    column *sortby* and, for every group, counts how often each distinct
    value of the column *feat* occurs.  A CSV table is written to standard
    output with one row per group: the group key, the number of rows, the
    number of distinct feature values and the estimate
    ``sum(count ** 2) / max(count) ** 2``.

    Args:
        f_in: Path of the input CSV file; it must start with a header row.
        feat: Header name of the column whose values are counted.
        sortby: Header name of the column the rows are grouped by.

    The process exits with status 2, after writing a message to standard
    error, when the file cannot be analysed by the CSV sniffer, when it
    has no header, or when one of the requested columns is missing.
    Completely blank lines in the input are ignored.
    """
    # Local import: only this function needs the exception type.
    from csv import Error as CsvError

    pkts = {}
    with open(f_in, 'r') as f:
        # Make sure that CSV file includes a header.  The sniffer raises
        # csv.Error when it cannot work out the dialect (e.g. empty file).
        try:
            has_header = Sniffer().has_header(f.read(8192))
        except CsvError as err:
            sys.stderr.write("ERROR: Cannot parse CSV file: %s\n" % err)
            sys.exit(2)
        if not has_header:
            sys.stderr.write("ERROR: CSV file has no header!\n")
            sys.exit(2)
        f.seek(0)

        rows = reader(f, delimiter=',', quotechar='"')

        # Parse header of the input file.  next() works on Python 2 and 3,
        # unlike the Python-2-only reader.next() method.
        header = next(rows)
        try:
            sort_field = header.index(sortby)
        except ValueError:
            sys.stderr.write("ERROR: Cannot find column '%s' in " % sortby +
                             "CSV file!\n")
            sys.exit(2)
        try:
            feat_field = header.index(feat)
        except ValueError:
            sys.stderr.write("ERROR: Cannot find column '%s' " % str(feat) +
                             "in CSV file!\n")
            sys.stderr.write("These are the values that were found: " +
                             "%s\n" % ", ".join(header))
            sys.exit(2)

        # Parse input file and build datastructure:
        # {group key: {feature value: number of occurrences}}
        for row in rows:
            if not row:
                # The csv reader yields [] for blank lines; nothing to count.
                continue
            symbols = pkts.setdefault(row[sort_field], {})
            symbols[row[feat_field]] = symbols.get(row[feat_field], 0) + 1

    # Calculate multimodality estimation and write to output.  sys.stdout is
    # used directly instead of opening "/dev/stdout", which is not portable
    # and cannot be opened in binary mode for csv.writer on Python 3.
    csvwriter = writer(sys.stdout, delimiter=',', quotechar='"',
                       quoting=QUOTE_MINIMAL)
    csvwriter.writerow([sortby, 'Number of packets',
                        'Number of states', 'Principal symbols'])

    for src in pkts:
        counts = list(pkts[src].values())
        squares = sum(count * count for count in counts)
        # Every group holds at least one value, so max() never sees [].
        multimod = float(squares) / pow(max(counts), 2)
        csvwriter.writerow([src, sum(counts), len(pkts[src]), multimod])
60
61
62 def _main():
63     # Argument handling
64     parser = ArgumentParser(description="Estimate principal symbols.",
65                             epilog='Example: %s --input ' % sys.argv[0] +
66                                     'csv_file.csv --feature "Time to live" ' +
67                                     '--sort-by \\ "Source"')
68     parser.add_argument("--input", type=str, required=True,
69                         help="Input CSV file.")
70     parser.add_argument("--feature", type=str, required=True,
71                         help="Feature to aggregate.")
72     parser.add_argument("--sort-by", type=str, default='Source',
73                         choices=['Source', 'Destination'], required=False,
74                         help="Sort by source or destination IP address.")
75     args = parser.parse_args()
76     if not exists(args.input):
77         sys.stderr.write("ERROR: Input file '%s' " % args.input +
78                          "does not exist!\n")
79         sys.exit(2)
80     principal_symbols(args.input, args.feature, args.sort_by)
81
82
# Run the command-line interface only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    _main()