srcfeat_power.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import sys
   5 from csv import reader, Sniffer, writer, QUOTE_MINIMAL
   6 from os.path import exists
   7 from argparse import ArgumentParser
   8
   9
  10 def principal_symbols(f_in, feat, sortby, output):
  11     pkts = dict()
  12     with open(f_in, 'r') as f:
  13         csv = reader(f, delimiter=',', quotechar='"')
  14
  15         # Make sure that CSV file includes a header.
  16         if not Sniffer().has_header(f.read(8192)):
  17             sys.stderr.write("ERROR: CSV file has no header!\n")
  18             sys.exit(2)
  19         f.seek(0)
  20
  21         # Parse header of the input file
  22         header = csv.next()
  23         try:
  24             sort_field = header.index(sortby)
  25         except ValueError:
  26             sys.stderr.write("ERROR: Cannot find column '%s' in " % sortby +
  27                              "CSV file!\n")
  28             sys.exit(2)
  29         try:
  30             feat_field = header.index(feat)
  31         except ValueError:
  32             sys.stderr.write("ERROR: Cannot find column '%s' " % str(feat) +
  33                              "in CSV file!\n")
  34             sys.stderr.write("These are the values that were found: " +
  35                              "%s\n" % ", ".join(header))
  36             sys.exit(2)
  37
  38         # Parse input file and build datastructure
  39         for row in csv:
  40             if row[sort_field] not in pkts:
  41                 pkts[row[sort_field]] = dict()
  42             if row[feat_field] not in pkts[row[sort_field]]:
  43                 pkts[row[sort_field]][row[feat_field]] = 1
  44             else:
  45                 pkts[row[sort_field]][row[feat_field]] += 1
  46
  47     # Calculate multimodality estimation and write to output
  48     csvfile = open(output, 'wb')
  49     csvwriter = writer(csvfile, delimiter=',', quotechar='"',
  50                        quoting=QUOTE_MINIMAL)
  51     csvwriter.writerow([sortby, 'Number of packets',
  52                         'Number of states', 'Principal symbols'])
  53
  54     for src in pkts.iterkeys():
  55         tmp_max = [pkts[src][feat] for feat in pkts[src].iterkeys()]
  56         tmp_sum = map(lambda x: pow(x, 2), tmp_max)
  57         multimod = float(sum(tmp_sum)) / pow(max(tmp_max), 2)
  58         csvwriter.writerow([src, sum(tmp_max), len(pkts[src]), multimod])
  59     csvfile.close()
  60
  61
  62 def _main():
  63     # Argument handling
  64     parser = ArgumentParser(description="Estimate principal symbols.",
  65                             epilog='Example: %s --input ' % sys.argv[0] +
  66                                     'csv_file.csv --feature "Time to live" ' +
  67                                     '--sort-by \\ "Source" --output test.csv')
  68     parser.add_argument("--input", type=str, required=True,
  69                         help="Input CSV file.")
  70     parser.add_argument("--output", type=str, required=True,
  71                         help="Output CSV file.")
  72     parser.add_argument("--feature", type=str, required=True,
  73                         help="Feature to aggregate.")
  74     parser.add_argument("--sort-by", type=str, default='Source',
  75                         choices=['Source', 'Destination'], required=False,
  76                         help="Sort by source or destination IP address.")
  77     args = parser.parse_args()
  78     if not exists(args.input):
  79         sys.stderr.write("ERROR: Input file '%s' " % args.input +
  80                          "does not exist!\n")
  81         sys.exit(2)
  82     if exists(args.output):
  83         sys.stderr.write("ERROR: Output file '%s' " % args.output +
  84                          "already exists!\n")
  85         sys.exit(2)
  86     principal_symbols(args.input, args.feature, args.sort_by, args.output)
  87
  88
# Run the command-line interface only when the file is executed as a script.
if __name__ == '__main__':
    _main()