ccoop_resplit.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169
  1. #! /usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. import os, sys, json
  4. import csv
  5. from collections import OrderedDict
  6. from datetime import datetime
  7. import hashlib
  8. import locale
  9. locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
  10. class CsvStatementParser(object):
  11. def __init__(self):
  12. self.lines = OrderedDict()
  13. self.fieldnames = None
  14. self.date_fieldname = "Date"
  15. self.overlap_detector = {}
  16. self.first_ops = {}
  17. self.last_ops = {}
  18. self.daterange = [datetime.now(), datetime.fromordinal(1)]
  19. def parse(self, filename):
  20. with open(filename, encoding='iso-8859-1') as csvfile:
  21. reader = csv.DictReader(csvfile, delimiter=';')
  22. if self.fieldnames is None:
  23. # Le premier fichier parcourru détermine les noms de
  24. # colonnes attendus dans les prochains fichiers.
  25. self.fieldnames = [k for k in reader.fieldnames if k != '']
  26. # On identifie également la permière colonne qui
  27. # ressemble à une date, elle servira ensuite de clef
  28. # d'indexation.
  29. for fname in self.fieldnames:
  30. if "date" in fname.lower():
  31. self.date_fieldname = fname
  32. break
  33. if self.fieldnames != [k for k in reader.fieldnames if k != '']:
  34. print("""Fichier ignoré : %s. Cause: does not have the expected column names.
  35. Found: %s
  36. Expected: %s
  37. """ % (filename, ",".join(reader.fieldnames), ",".join(self.fieldnames)))
  38. else:
  39. self._parse_file(filename, reader)
  40. def _parse_file(self, filename, reader):
  41. print("Lecture du fichier %s" % os.path.basename(filename))
  42. for row in reader:
  43. opdate = datetime.strptime(row[self.date_fieldname], '%d/%m/%Y')
  44. ophash = datetime.strftime(opdate, '%Y-%m-%d') + hashlib.md5(json.dumps(row).encode()).hexdigest()
  45. self.lines[ophash] = {k:v for k,v in row.items() if k != ''}
  46. # Adjust dateranges
  47. if opdate < self.daterange[0]:
  48. self.daterange[0] = opdate
  49. if opdate > self.daterange[1]:
  50. self.daterange[1] = opdate
  51. # Prepare overlap detection
  52. if ophash not in self.overlap_detector:
  53. self.overlap_detector[ophash] = set()
  54. self.overlap_detector[ophash].add(filename)
  55. # Remember first line of each CSV file
  56. if filename not in self.first_ops:
  57. self.first_ops[filename] = ophash
  58. # Remember first line of each CSV file
  59. if filename not in self.last_ops:
  60. self.last_ops[filename] = ophash
  61. def dump_full(self, output_filename):
  62. with open(output_filename, 'w') as outfile:
  63. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  64. writer.writeheader()
  65. for line in reversed(sorted(self.lines.items())):
  66. writer.writerow(line[1])
  67. print("Relevé intégral généré dans le fichier %s" % os.path.abspath(output_filename))
  68. def dump_monthly_reports(self, outputdir):
  69. firstmonth = int('{:%Y%m}'.format(self.daterange[0])) + 1
  70. lastmonth = int('{:%Y%m}'.format(self.daterange[1])) - 1
  71. if firstmonth >= lastmonth:
  72. print("Impossible de générer des relevés mensuels car la plage de dates traitée est trop petite.")
  73. return
  74. curmonth = firstmonth
  75. def __openfile__(curmonth):
  76. dt = datetime.strptime(str(curmonth), '%Y%m')
  77. # fname = "releve_{0}__{1:_<5}_{2}.csv".format(curmonth, dt.strftime('%b'), dt.strftime('%Y'))
  78. fname = "releve_{0}.csv".format(curmonth)
  79. outfile = open(os.path.join(outputdir, fname), 'w')
  80. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  81. return outfile, writer
  82. outfile, writer = __openfile__(curmonth)
  83. writer = csv.DictWriter(outfile, self.fieldnames, delimiter=';')
  84. for line in sorted(self.lines.items()):
  85. month = int(line[0][0:4] + line[0][5:7])
  86. if month < curmonth:
  87. continue
  88. if month > lastmonth:
  89. break
  90. if month > curmonth:
  91. outfile.close()
  92. curmonth = month
  93. outfile, writer = __openfile__(curmonth)
  94. writer.writerow(line[1])
  95. outfile.close()
  96. print("Relevés mensuels générés dans le dossier %s" % os.path.abspath(outputdir))
  97. def check_overlaps(self):
  98. """
  99. Helps finding possible missing operations if exported CSV files
  100. are not "contiguous".
  101. """
  102. print("\nRecherche de chevauchements (les chevauchements de fichiers CSV c'est bien, ça rassure)...")
  103. for filename, first_op in self.first_ops.items():
  104. if first_op in self.overlap_detector:
  105. otherfiles = [v for v in self.overlap_detector.get(first_op)]
  106. otherfiles.remove(filename)
  107. if len(otherfiles) > 0:
  108. # Eliminate files having the same first_op
  109. for candidate in otherfiles:
  110. if self.first_ops[candidate] == first_op:
  111. otherfiles.remove(candidate)
  112. if len(otherfiles) == 0:
  113. print("Attention. Il y a peut-être des écritures manquantes après le %s (fichier %s)." % (first_op[0:10], os.path.basename(filename)))
  114. for filename, last_op in self.last_ops.items():
  115. if last_op in self.overlap_detector:
  116. otherfiles = [v for v in self.overlap_detector.get(last_op)]
  117. otherfiles.remove(filename)
  118. if len(otherfiles) > 0:
  119. # Eliminate files having the same last_op
  120. for candidate in otherfiles:
  121. if self.last_ops[candidate] == last_op:
  122. otherfiles.remove(candidate)
  123. if len(otherfiles) == 0:
  124. print("Attention. Il y a peut-être des écritures manquantes avant le %s (fichier %s)." % (last_op[0:10], os.path.basename(filename)))
  125. print("")
  126. def start_cli(dirpath):
  127. # Lecture des fichiers CSV présents dans le dossier
  128. p = CsvStatementParser()
  129. for f in sorted(os.listdir(dirpath)):
  130. if f.endswith('.csv') or f.endswith('.CSV'):
  131. p.parse(os.path.join(dirpath, f))
  132. print("Les écritures lues s'étalent entre le {0:%d %B %Y} et le {1:%d %B %Y}.".format(p.daterange[0], p.daterange[1]))
  133. # Recherche de chevauchements
  134. p.check_overlaps()
  135. # Création d'un dossier pour stocker les fichiers générés
  136. outputdir = os.path.join(dirpath, "output")
  137. if not os.path.isdir(outputdir):
  138. os.makedirs(outputdir)
  139. # Générer un relevé intégral et des relevés mensuels
  140. p.dump_full(os.path.join(outputdir, "integral.csv"))
  141. p.dump_monthly_reports(outputdir)
  142. if __name__ == '__main__':
  143. if len(sys.argv) < 2:
  144. print("Erreur. Merci de préciser le chemin du dossier où se trouvent les fichiers CSV à analyser.")
  145. sys.exit(1)
  146. inputdir = sys.argv[1]
  147. start_cli(inputdir)