ccoop_resplit.py 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. #! /usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import os, sys, json
  4. import csv
  5. from collections import OrderedDict
  6. from datetime import datetime
  7. import hashlib
  8. import locale
  9. locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
  10. class CsvStatementParser(object):
  11. def __init__(self):
  12. self.lines = OrderedDict()
  13. self.fieldnames = None
  14. self.date_fieldname = "Date"
  15. self.overlap_detector = {}
  16. self.first_ops = {}
  17. self.last_ops = {}
  18. self.daterange = [datetime.now(), datetime.fromordinal(1)]
  19. def parse(self, filename):
  20. with open(filename, encoding='iso-8859-1') as csvfile:
  21. reader = csv.DictReader(csvfile, delimiter=';')
  22. if self.fieldnames is None:
  23. # Le premier fichier parcourru détermine les noms de
  24. # colonnes attendus dans les prochains fichiers.
  25. self.fieldnames = [k for k in reader.fieldnames if k != '']
  26. # On identifie également la permière colonne qui
  27. # ressemble à une date, elle servira ensuite de clef
  28. # d'indexation.
  29. for fname in self.fieldnames:
  30. if "date" in fname.lower():
  31. self.date_fieldname = fname
  32. break
  33. if self.fieldnames != [k for k in reader.fieldnames if k != '']:
  34. print("""Fichier ignoré : %s. Cause: does not have the expected column names.
  35. Found: %s
  36. Expected: %s
  37. """ % (filename, ",".join(reader.fieldnames), ",".join(self.fieldnames)))
  38. else:
  39. self._parse_file(filename, reader)
  40. def _parse_file(self, filename, reader):
  41. print("Lecture du fichier %s" % os.path.basename(filename))
  42. for row in reader:
  43. opdate = datetime.strptime(row[self.date_fieldname], '%d/%m/%Y')
  44. ophash = datetime.strftime(opdate, '%Y-%m-%d') + hashlib.md5(json.dumps(row).encode()).hexdigest()
  45. self.lines[ophash] = {k:v for k,v in row.items() if k != ''}
  46. # Adjust dateranges
  47. if opdate < self.daterange[0]:
  48. self.daterange[0] = opdate
  49. if opdate > self.daterange[1]:
  50. self.daterange[1] = opdate
  51. # Prepare overlap detection
  52. if ophash not in self.overlap_detector:
  53. self.overlap_detector[ophash] = set()
  54. self.overlap_detector[ophash].add(filename)
  55. # Remember first line of each CSV file
  56. if filename not in self.first_ops:
  57. self.first_ops[filename] = ophash
  58. # Remember first line of each CSV file
  59. if filename not in self.last_ops:
  60. self.last_ops[filename] = ophash
  61. # CSV files are sometimes sorted by date ASC and sometimes
  62. # sorted by date DESC. So we may need to swap first_op and last_op.
  63. if (int(self.first_ops[filename][0:10].replace('-', '')) > int(self.last_ops[filename][0:10].replace('-', ''))):
  64. tmp = self.first_ops[filename]
  65. self.first_ops[filename] = self.last_ops[filename]
  66. self.last_ops[filename] = tmp
  67. def dump_full(self, output_filename):
  68. with open(output_filename, 'w') as outfile:
  69. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  70. writer.writeheader()
  71. for line in reversed(sorted(self.lines.items())):
  72. writer.writerow(line[1])
  73. print("Relevé intégral généré dans le fichier %s" % os.path.abspath(output_filename))
  74. def dump_monthly_reports(self, outputdir):
  75. firstmonth = int('{:%Y%m}'.format(self.daterange[0])) + 1
  76. lastmonth = int('{:%Y%m}'.format(self.daterange[1])) - 1
  77. if firstmonth >= lastmonth:
  78. print("Impossible de générer des relevés mensuels car la plage de dates traitée est trop petite.")
  79. return
  80. curmonth = firstmonth
  81. def __openfile__(curmonth):
  82. dt = datetime.strptime(str(curmonth), '%Y%m')
  83. # fname = "releve_{0}__{1:_<5}_{2}.csv".format(curmonth, dt.strftime('%b'), dt.strftime('%Y'))
  84. fname = "releve_{0}.csv".format(curmonth)
  85. outfile = open(os.path.join(outputdir, fname), 'w')
  86. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  87. return outfile, writer
  88. outfile, writer = __openfile__(curmonth)
  89. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  90. for line in sorted(self.lines.items()):
  91. month = int(line[0][0:4] + line[0][5:7])
  92. if month < curmonth:
  93. continue
  94. if month > lastmonth:
  95. break
  96. if month > curmonth:
  97. outfile.close()
  98. curmonth = month
  99. outfile, writer = __openfile__(curmonth)
  100. writer.writerow(line[1])
  101. outfile.close()
  102. print("Relevés mensuels générés dans le dossier %s" % os.path.abspath(outputdir))
  103. def check_overlaps(self):
  104. """
  105. Helps finding possible missing operations if exported CSV files
  106. are not "contiguous".
  107. """
  108. print("\nRecherche de chevauchements (les chevauchements de fichiers CSV c'est bien, ça rassure)...")
  109. for filename, first_op in self.first_ops.items():
  110. if first_op in self.overlap_detector:
  111. otherfiles = [v for v in self.overlap_detector.get(first_op)]
  112. otherfiles.remove(filename)
  113. if len(otherfiles) > 0:
  114. # Eliminate files having the same first_op
  115. for candidate in otherfiles:
  116. if self.first_ops[candidate] == first_op:
  117. otherfiles.remove(candidate)
  118. if len(otherfiles) == 0:
  119. print("Attention. Il y a peut-être des écritures manquantes avant le %s (fichier %s)." % (first_op[0:10], os.path.basename(filename)))
  120. for filename, last_op in self.last_ops.items():
  121. if last_op in self.overlap_detector:
  122. otherfiles = [v for v in self.overlap_detector.get(last_op)]
  123. otherfiles.remove(filename)
  124. if len(otherfiles) > 0:
  125. # Eliminate files having the same last_op
  126. for candidate in otherfiles:
  127. if self.last_ops[candidate] == last_op:
  128. otherfiles.remove(candidate)
  129. if len(otherfiles) == 0:
  130. print("Attention. Il y a peut-être des écritures manquantes après le %s (fichier %s)." % (last_op[0:10], os.path.basename(filename)))
  131. print("")
  132. def start_cli(dirpath):
  133. # Lecture des fichiers CSV présents dans le dossier
  134. p = CsvStatementParser()
  135. for f in sorted(os.listdir(dirpath)):
  136. if f.endswith('.csv') or f.endswith('.CSV'):
  137. p.parse(os.path.join(dirpath, f))
  138. print("Les écritures lues s'étalent entre le {0:%d %B %Y} et le {1:%d %B %Y}.".format(p.daterange[0], p.daterange[1]))
  139. # Recherche de chevauchements
  140. p.check_overlaps()
  141. # Création d'un dossier pour stocker les fichiers générés
  142. outputdir = os.path.join(dirpath, "output")
  143. if not os.path.isdir(outputdir):
  144. os.makedirs(outputdir)
  145. # Générer un relevé intégral et des relevés mensuels
  146. p.dump_full(os.path.join(outputdir, "integral.csv"))
  147. p.dump_monthly_reports(outputdir)
  148. if __name__ == '__main__':
  149. if len(sys.argv) < 2:
  150. print("Erreur. Merci de préciser le chemin du dossier où se trouvent les fichiers CSV à analyser.")
  151. sys.exit(1)
  152. inputdir = sys.argv[1]
  153. start_cli(inputdir)