Browse Source

Bugfix: duplicate lines are now detected and no longer lost.

pitchum 7 years ago
parent
commit
4f4570cabf
1 changed file with 11 additions and 0 deletions
  1. 11 0
      ccoop_resplit.py

+ 11 - 0
ccoop_resplit.py

@@ -23,6 +23,7 @@ class CsvStatementParser(object):
         self.first_ops = {}
         self.first_ops = {}
         self.last_ops = {}
         self.last_ops = {}
         self.daterange = [datetime.now(), datetime.fromordinal(1)]
         self.daterange = [datetime.now(), datetime.fromordinal(1)]
+        self.dups = dict() # holds counters for duplicate lines
     
     
     
     
     def parse(self, filename):
     def parse(self, filename):
@@ -51,10 +52,20 @@ class CsvStatementParser(object):
     
     
     
     
     def _parse_file(self, filename, reader):
     def _parse_file(self, filename, reader):
+        self.dups = dict() # Duplicate counters must be reset for each file
         print("Lecture du fichier %s" % os.path.basename(filename))
         print("Lecture du fichier %s" % os.path.basename(filename))
         for row in reader:
         for row in reader:
             opdate = datetime.strptime(row[self.date_fieldname], '%d/%m/%Y')
             opdate = datetime.strptime(row[self.date_fieldname], '%d/%m/%Y')
             ophash = datetime.strftime(opdate, '%Y-%m-%d') + hashlib.md5(json.dumps(row).encode()).hexdigest()
             ophash = datetime.strftime(opdate, '%Y-%m-%d') + hashlib.md5(json.dumps(row).encode()).hexdigest()
+            # Special use case: one file contains multiple identical lines.
+            # Then we append a counter to the duplicate ophash.
+            if ophash in self.lines:
+                print("*** Duplicate line found in {}: {}".format(filename, ';'.join(row.values())))
+                if ophash not in self.dups:
+                    self.dups[ophash] = 1
+                self.dups[ophash] = self.dups[ophash] + 1
+                ophash = ophash + "-" + str(self.dups[ophash])
+                # print("   We have now :\n  {}\n  {}".format("\n  ".join([h + "   // " + "".join(v.values()) for h,v in self.lines.items() if h.startswith(ophash[0:10])]), ophash + " // " + "".join(row.values()))) # XXX DEBUG
             self.lines[ophash] = {k:v for k,v in row.items() if k != ''}
             self.lines[ophash] = {k:v for k,v in row.items() if k != ''}
             # Adjust dateranges
             # Adjust dateranges
             if opdate < self.daterange[0]:
             if opdate < self.daterange[0]: