# -*- coding: utf-8 -*-
import sys
from argparse import ArgumentParser
from csv import reader, Sniffer
from os.path import exists

from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import DataFrame, FloatVector
def _convert_to_dataframe(x):
    """Convert a Python list of numbers to a one-column R data frame.

    x is an iterable of numbers, or of strings that float() accepts.
    The result is an rpy2 DataFrame whose single column 'y' is a
    FloatVector holding the converted values.

    Raises ValueError if an element cannot be converted to float.
    """
    # NOTE(review): the reviewed text only showed the docstring and the
    # assignment `tmp['y'] = FloatVector(y)`; how `y` was derived from `x`
    # and whether further columns were added was not visible. This version
    # converts every element explicitly and builds the single 'y' column --
    # confirm against the original behaviour.
    y = [float(value) for value in x]
    return DataFrame({'y': FloatVector(y)})
def _find_column(header, name):
    """Return the index of column `name` in `header`, or None if absent.

    When the column is missing, the problem and the column names that were
    found are reported on stderr.
    """
    try:
        return header.index(name)
    except ValueError:
        sys.stderr.write("ERROR: Cannot find column '%s' " % str(name) +
                         "in CSV file!\n")
        sys.stderr.write("These are the values that were found: " +
                         "%s\n" % ", ".join(header))
        return None


def autocorrelation(f_in, field, source_ip=None, source_port=None,
                    destination_ip=None, destination_port=None):
    """Print the mean absolute autocorrelation of one CSV column.

    f_in is the path of a comma-separated file with a header row and
    field is the name of the column to analyse. The four optional
    arguments restrict the rows that are used: a row is kept only if its
    'Source', 'Source port', 'Destination' and 'Destination port' cells
    are equal (as strings) to the respective argument. A filter that is
    None or empty is not applied and its column need not exist.

    The selected values are handed to R's stats::acf; the absolute
    autocorrelation coefficients of all lags except lag 0 are averaged and
    the result is written to stdout. Problems (no header, unknown column,
    no matching rows) are reported on stderr and the function returns
    without a result.

    NOTE(review): the statements following each error message were not
    visible in the reviewed text; this version returns None in every case
    -- confirm that callers do not rely on a process exit here.
    """
    # Imported locally so the block does not depend on the file-level
    # import line being extended.
    from csv import Error as CsvError

    with open(f_in, 'r') as f:
        # Sniff the header on the raw text *before* parsing, then rewind so
        # the reader starts at the first line again.
        try:
            has_header = Sniffer().has_header(f.read(8192))
        except CsvError:
            # Raised e.g. for an empty file (no delimiter can be guessed).
            has_header = False
        if not has_header:
            sys.stderr.write("ERROR: CSV file has no header!\n")
            return
        f.seek(0)

        rows = reader(f, delimiter=',', quotechar='"')
        header = next(rows)

        feat_field = _find_column(header, field)
        if feat_field is None:
            return

        # Resolve only the columns whose filter is actually in use, in the
        # same order as the original lookups.
        criteria = []
        for column, wanted in (("Source", source_ip),
                               ("Source port", source_port),
                               ("Destination port", destination_port),
                               ("Destination", destination_ip)):
            if not wanted:
                continue
            position = _find_column(header, column)
            if position is None:
                return
            criteria.append((position, wanted))

        # Rows that are too short to contain every needed column (blank
        # lines, truncated records) are skipped instead of raising
        # IndexError.
        widest = max([feat_field] + [position for position, _ in criteria])
        values = [row[feat_field] for row in rows
                  if len(row) > widest and
                  all(row[position] == wanted
                      for position, wanted in criteria)]

    if not values:
        sys.stderr.write("ERROR: No values match your filtering criteria.\n")
        return

    # A constant series has zero variance, so acf() would be undefined.
    if len(set(values)) == 1:
        sys.stdout.write("All values are identical "
                         "(perfect self-correlation).\n")
        return

    r_stats = importr('stats')
    acf = r_stats.acf(_convert_to_dataframe(values), plot=False)

    # The first coefficient belongs to lag 0 (always 1) and is ignored; we
    # want only absolute values of the remaining ones.
    coefficients = list(acf.rx2('acf'))[1:]

    # Make sure we do not try to divide by zero.
    if not coefficients:
        # NOTE(review): the original handling of this case was not visible.
        sys.stderr.write("ERROR: Not enough lags to calculate the "
                         "autocorrelation.\n")
        return

    autocorr_sum = sum(abs(coefficient) for coefficient in coefficients)
    sys.stdout.write("%s\n" % (autocorr_sum / len(coefficients)))
def main():
    """Parse the command line and run the autocorrelation analysis.

    Reads the options from sys.argv, checks that the input file exists and
    passes all options on to autocorrelation(). Returns 1 if the input
    file does not exist and 0 otherwise, so the value can be used as the
    process exit status.

    NOTE(review): the function header and the statement following the
    "Input file" error were not visible in the reviewed text; the name
    `main`, the return codes and the tail of that message are
    reconstructed -- confirm.
    """
    # The example must be directly pastable into a shell, so no stray
    # backslash between the option and its value.
    parser = ArgumentParser(description="Calculate sum of autocorrelation.",
                            epilog='Example: %s --input ' % sys.argv[0] +
                                   'file.csv --field TTL ' +
                                   '--source-ip "192.168.0.1" ' +
                                   '--destination-port 80')
    parser.add_argument("--input", type=str, required=True,
                        help="Input CSV file.")
    parser.add_argument("--field", type=str, required=True,
                        help="Field for which autocorrelation is calculated.")
    parser.add_argument("--source-ip", type=str, required=False,
                        help="Restrict to a specific source IP address.")
    parser.add_argument("--destination-ip", type=str, required=False,
                        help="Restrict to a specific destination IP address.")
    # Ports stay strings on purpose: they are compared against CSV cells.
    parser.add_argument("--source-port", type=str, required=False,
                        help="Restrict to a specific source port.")
    parser.add_argument("--destination-port", type=str, required=False,
                        help="Restrict to a specific destination port.")

    args = parser.parse_args()
    if not exists(args.input):
        sys.stderr.write("ERROR: Input file '%s' " % args.input +
                         "does not exist!\n")
        return 1

    autocorrelation(args.input, args.field, args.source_ip, args.source_port,
                    args.destination_ip, args.destination_port)
    return 0


if __name__ == '__main__':
    sys.exit(main())