]> git.somenet.org - pub/jan/netsec2.git/blob - srcfeat_power.py
asdf
[pub/jan/netsec2.git] / srcfeat_power.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import sys
5 from csv import reader, Sniffer, writer, QUOTE_MINIMAL
6 from os.path import exists
7 from argparse import ArgumentParser
8
9
def principal_symbols(f_in, feat, sortby, output):
    """Estimate the number of principal symbols of a feature per group.

    Reads the CSV file ``f_in`` (which must start with a header row), groups
    its rows by the value found in the ``sortby`` column and counts how often
    each distinct value of the ``feat`` column occurs inside every group.
    One row per group is then written to the CSV file ``output``:

    * the group key (value of the ``sortby`` column),
    * ``Number of packets``  -- number of rows in the group,
    * ``Number of states``   -- number of distinct ``feat`` values,
    * ``Principal symbols``  -- ``sum(c ** 2) / max(c) ** 2`` over the
      per-value occurrence counts ``c`` of the group.

    Works on both Python 2 and Python 3.

    :param f_in: path of the input CSV file.
    :param feat: name of the header column whose values are counted.
    :param sortby: name of the header column used to group the rows.
    :param output: path of the CSV file to write (it is overwritten).
    :raises SystemExit: with status 2 if the CSV format cannot be detected,
        the file has no header, or one of the two columns is missing.
    :raises IndexError: if a non-empty data row is shorter than required to
        reach the ``sortby`` or ``feat`` column.
    """
    # Local import: the module-level import list does not provide csv.Error.
    from csv import Error as CsvError

    pkts = dict()
    with open(f_in, 'r') as f:
        # Make sure that CSV file includes a header.  The sniffer raises
        # csv.Error when it cannot even guess a delimiter (e.g. empty file).
        try:
            has_header = Sniffer().has_header(f.read(8192))
        except CsvError:
            sys.stderr.write("ERROR: Cannot determine the format of the "
                             "CSV file!\n")
            sys.exit(2)
        if not has_header:
            sys.stderr.write("ERROR: CSV file has no header!\n")
            sys.exit(2)
        f.seek(0)

        rows = reader(f, delimiter=',', quotechar='"')

        # Parse header of the input file.  next() works on Python 2 and 3,
        # unlike the Python-2-only reader.next() method.
        header = next(rows)
        try:
            sort_field = header.index(sortby)
        except ValueError:
            sys.stderr.write("ERROR: Cannot find column '%s' in " % sortby +
                             "CSV file!\n")
            sys.exit(2)
        try:
            feat_field = header.index(feat)
        except ValueError:
            sys.stderr.write("ERROR: Cannot find column '%s' " % str(feat) +
                             "in CSV file!\n")
            sys.stderr.write("These are the values that were found: " +
                             "%s\n" % ", ".join(header))
            sys.exit(2)

        # Parse input file and build datastructure:
        # {group key: {feature value: number of occurrences}}
        for row in rows:
            if not row:
                # The csv reader yields [] for blank lines; nothing to count.
                continue
            counts = pkts.setdefault(row[sort_field], dict())
            value = row[feat_field]
            counts[value] = counts.get(value, 0) + 1

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(output, 'wb')
    else:
        csvfile = open(output, 'w', newline='')

    # Calculate multimodality estimation and write to output.  The ``with``
    # block guarantees the file is closed even if a write fails.
    with csvfile:
        csvwriter = writer(csvfile, delimiter=',', quotechar='"',
                           quoting=QUOTE_MINIMAL)
        csvwriter.writerow([sortby, 'Number of packets',
                            'Number of states', 'Principal symbols'])

        for src in pkts:
            occurrences = list(pkts[src].values())
            squared_sum = sum(pow(n, 2) for n in occurrences)
            # Every group holds at least one value, so max() is never empty.
            multimod = float(squared_sum) / pow(max(occurrences), 2)
            csvwriter.writerow([src, sum(occurrences), len(pkts[src]),
                                multimod])
60
61
62 def _main():
63     # Argument handling
64     parser = ArgumentParser(description="Estimate principal symbols.",
65                             epilog='Example: %s --input ' % sys.argv[0] +
66                                     'csv_file.csv --feature "Time to live" ' +
67                                     '--sort-by \\ "Source" --output test.csv')
68     parser.add_argument("--input", type=str, required=True,
69                         help="Input CSV file.")
70     parser.add_argument("--output", type=str, required=True,
71                         help="Output CSV file.")
72     parser.add_argument("--feature", type=str, required=True,
73                         help="Feature to aggregate.")
74     parser.add_argument("--sort-by", type=str, default='Source',
75                         choices=['Source', 'Destination'], required=False,
76                         help="Sort by source or destination IP address.")
77     args = parser.parse_args()
78     if not exists(args.input):
79         sys.stderr.write("ERROR: Input file '%s' " % args.input +
80                          "does not exist!\n")
81         sys.exit(2)
82     if exists(args.output):
83         sys.stderr.write("ERROR: Output file '%s' " % args.output +
84                          "already exists!\n")
85         sys.exit(2)
86     principal_symbols(args.input, args.feature, args.sort_by, args.output)
87
88
# Run the command-line interface only when this file is executed as a
# script; importing it as a module must not parse arguments or touch files.
if __name__ == "__main__":
    _main()