recordings can now be arbitrarily arranged in the groundtruth folder

bjarthur · bjarthur · commit a4a3ebd14d89 · 2024-07-25T17:15:55.000-04:00
diff --git a/src/accuracy b/src/accuracy
@@ -163,7 +163,7 @@ def doit(logdir, key_to_plot, ckpt, labels, nprobabilities, error_ratios, loss,
   if loss=='exclusive':
       for subdir in set([x['file'][0] for x in validation_sounds]):
           with open(os.path.join(logdir, key_to_plot, 'predictions.ckpt-'+str(ckpt), \
-                                 subdir+'-mistakes.csv'), \
+                                 subdir.replace(os.path.sep,'-')+'-mistakes.csv'), \
                     'w', newline='') as csvfile:
             csvwriter = csv.writer(csvfile, lineterminator='\n')
             for i in range(len(validation_sounds)):
diff --git a/src/congruence b/src/congruence
@@ -4,6 +4,7 @@
 
 # e.g. congruence \
 #     --basepath=/groups/stern/sternlab/behavior/arthurb/groundtruth/kyriacou2017 \
+#     --topath=/groups/stern/sternlab/behavior/arthurb/groundtruth/kyriacou2017/congruence-20240718T091400 \
 #     --wavfiles=PS_20130625111709_ch3.wav,PS_20130625111709_ch7.wav \
 #     --portion=union \
 #     --convolve_ms=0 \
@@ -146,11 +147,20 @@ def main():
   convolve_tic = int(FLAGS.convolve_ms/2/1000*FLAGS.audio_tic_rate)
 
   wavdirs = {}
-  for subdir in filter(lambda x: os.path.isdir(os.path.join(FLAGS.basepath,x)), \
-                       os.listdir(FLAGS.basepath)):
-    commonsubfiles = wavfiles & set(os.listdir(os.path.join(FLAGS.basepath, subdir)))
-    if len(commonsubfiles) > 0:
-      wavdirs[subdir] = commonsubfiles
+  def traverse(curdir):
+      # Walk the tree under FLAGS.basepath; curdir is relative to basepath ("" is the root).
+      # Subfolders are visited first, then curdir is added to wavdirs if it directly contains
+      # any of the names in wavfiles.
+      here = os.path.join(FLAGS.basepath, curdir)
+      files_here = set()
+      for name in os.listdir(here):
+          if os.path.isdir(os.path.join(here, name)):
+              traverse(os.path.join(curdir, name))
+          else:
+              files_here.add(name)
+      commonsubfiles = wavfiles & files_here
+      if commonsubfiles:
+        wavdirs[curdir] = commonsubfiles
+  traverse("")
 
   labels=None
   temp_files=[]
@@ -209,6 +216,7 @@ def main():
   annotator_keys = set()
 
   for wavdir in wavdirs:
+    os.makedirs(os.path.join(FLAGS.topath, wavdir), exist_ok=True)
     for csvfile in filter(lambda x: ("-annotated-" in x or "-predicted-" in x) and
                                     x.endswith('.csv'), \
                           os.listdir(os.path.join(FLAGS.basepath,wavdir))):
@@ -468,31 +476,31 @@ def main():
       if do_tic:
         fig_tic.tight_layout()
         plt.figure(fig_tic.number)
-        plt.savefig(os.path.join(FLAGS.basepath, 'congruence.tic.'+label+'.'+pr+'.pdf'))
+        plt.savefig(os.path.join(FLAGS.topath, 'congruence.tic.'+label+'.'+pr+'.pdf'))
         plt.close()
       if do_label:
         fig_label.tight_layout()
         plt.figure(fig_label.number)
-        plt.savefig(os.path.join(FLAGS.basepath, 'congruence.label.'+label+'.'+pr+'.pdf'))
+        plt.savefig(os.path.join(FLAGS.topath, 'congruence.label.'+label+'.'+pr+'.pdf'))
         plt.close()
       if len(sorted_hm)<4:
         if do_tic:
           fig_tic_venn.tight_layout()
           plt.figure(fig_tic_venn.number)
-          plt.savefig(os.path.join(FLAGS.basepath, 'congruence.tic.'+label+'.'+pr+'-venn.pdf'))
+          plt.savefig(os.path.join(FLAGS.topath, 'congruence.tic.'+label+'.'+pr+'-venn.pdf'))
           plt.close()
         if do_label:
           fig_label_venn.tight_layout()
           plt.figure(fig_label_venn.number)
-          plt.savefig(os.path.join(FLAGS.basepath, 'congruence.label.'+label+'.'+pr+'-venn.pdf'))
+          plt.savefig(os.path.join(FLAGS.topath, 'congruence.label.'+label+'.'+pr+'-venn.pdf'))
           plt.close()
 
   if FLAGS.parallelize!=0:
     pool.close()
 
   def to_csv(intervals, csvbase, whichset):
     filename = os.path.splitext(csvbase)[0]+'-disjoint-'+whichset+'.csv'
-    with open(os.path.join(FLAGS.basepath,filename), 'w') as fid:
+    with open(os.path.join(FLAGS.topath,filename), 'w') as fid:
       csvwriter = csv.writer(fid, lineterminator='\n')
       for ilabel,label in enumerate(timestamps.keys()):
         for i in intervals[ilabel]:
@@ -627,7 +635,7 @@ def main():
         ax2.legend(loc=(1.05, 0.0))
         ax1.legend(loc=(1.2, 0.1))
         fig.tight_layout()
-        plt.savefig(os.path.join(FLAGS.basepath,'congruence.'+measure+'.'+label+'.pdf'))
+        plt.savefig(os.path.join(FLAGS.topath,'congruence.'+measure+'.'+label+'.pdf'))
         plt.close()
 
         inotnan = (~np.isnan(P) & ~np.isnan(R)).nonzero()[0]
@@ -637,7 +645,7 @@ def main():
         else:
           print(measure+' '+label+' area cannot be computed because recall is not monotonic')
 
-      with open(os.path.join(FLAGS.basepath,'congruence.'+measure+'.'+label+'.csv'), 'w') as fid:
+      with open(os.path.join(FLAGS.topath,'congruence.'+measure+'.'+label+'.csv'), 'w') as fid:
         csvwriter = csv.writer(fid, lineterminator='\n')
         rows = roc_table[label].keys()
         cols = roc_table[label][next(iter(rows-thresholds))].keys()
@@ -671,6 +679,9 @@ if __name__ == "__main__":
   parser.add_argument(
       '--basepath',
       type=str)
+  parser.add_argument(
+      '--topath',
+      type=str)
   parser.add_argument(
       '--wavfiles',
       type=str)
diff --git a/src/data.py b/src/data.py
@@ -199,16 +199,23 @@ def prepare_data_index(self,
         video_frame_height = model_settings['video_frame_height']
         video_channels = model_settings['video_channels']
         shiftby_tics = int(shiftby_ms * model_settings["audio_tic_rate"] / 1000)
-        search_path = os.path.join(self.data_dir, '*', '*.csv')
         audio_ntics = {}
         video_nframes = {}
         subsample = {x:int(y) for x,y in zip(subsample_label.split(','),subsample_skip.split(','))
                               if x != ''}
         partition_labels = partition_label.split(',')
         if '' in partition_labels:
             partition_labels.remove('')
-        for csv_path in glob(search_path):
-            with (open(csv_path, 'r')) as csv_file:
+        # csv_path is relative to self.data_dir and may be nested arbitrarily deep
+        for csv_path in glob("**/*.csv", root_dir=self.data_dir, recursive=True):
+            # Skip generated output folders wherever they sit in the path: an oldfiles-*
+            # folder can be nested inside a recording folder, and a congruence-* folder
+            # can hold subfolders of its own, so matching the whole dirname is not enough.
+            csv_dir = os.path.dirname(csv_path)
+            if any(re.fullmatch('(congruence|oldfiles)-[0-9]{8}T[0-9]{6}', part)
+                   for part in csv_dir.split(os.sep)):
+                continue
+            with (open(os.path.join(self.data_dir, csv_path), 'r')) as csv_file:
                 annotation_reader = csv.reader(csv_file)
                 annotation_list = list(annotation_reader)
             if len(partition_labels)>0:
@@ -226,8 +229,8 @@ def prepare_data_index(self,
                 if (label if loss=='exclusive' else
                     label.removeprefix(overlapped_prefix)) not in labels_touse:
                     continue
-                wav_path=os.path.join(os.path.dirname(csv_path),wavfile)
-                wav_base2=[os.path.basename(os.path.dirname(csv_path)), wavfile]
+                wav_path = os.path.join(self.data_dir, os.path.dirname(csv_path), wavfile)
+                wav_base2 = [os.path.dirname(csv_path), wavfile]
                 if wavfile in validation_files:
                     set_index = 'validation'
                 elif wavfile in testing_files:
diff --git a/src/gui/controller.py b/src/gui/controller.py
@@ -1027,16 +1027,6 @@ async def misses_actuate():
                                             misses_succeeded(w, t)))
     asyncio.create_task(actuate_finalize(threads, results, V.groundtruth_update))
 
-def isoldfile(x,subdir,basewavs):
-    return \
-        np.any([x.startswith(b+'-') and x.endswith('.wav') for b in basewavs]) or \
-        x.endswith('-classify.log') or \
-        '-predicted' in x or \
-        x.endswith('-ethogram.log') or \
-        '-missed' in x or \
-        x.endswith('-misses.log') or \
-        x == subdir+'.csv'
-
 def _validation_test_files(files_string, comma=True):
     if files_string.rstrip(os.sep) == V.groundtruth_folder.value.rstrip(os.sep):
         dfs = []
@@ -1137,31 +1127,48 @@ def sequester_stalefiles():
     M.annotated_csvfiles_all=set([])
     for button in V.nsounds_per_label_buttons:
         button.label = str(0)
-    for subdir in filter(lambda x: os.path.isdir(os.path.join(V.groundtruth_folder.value,x)), \
-                         os.listdir(V.groundtruth_folder.value)):
+
+    import re
+
+    def isoldfile(x,curdir,basewavs):
+        # True if file name x, found in folder curdir, looks like generated output:
+        # a wav named <annotated base>-*.wav, a classify/ethogram/misses log, anything
+        # with -predicted or -missed in its name, or the csv named after the folder.
+        # curdir is a full path, so only its last component is compared with x.
+        return \
+            np.any([x.startswith(b+'-') and x.endswith('.wav') for b in basewavs]) or \
+            x.endswith('-classify.log') or \
+            '-predicted' in x or \
+            x.endswith('-ethogram.log') or \
+            '-missed' in x or \
+            x.endswith('-misses.log') or \
+            x == os.path.basename(os.path.normpath(curdir))+'.csv'
+
+    def _sequester(curdir):
+        # Recursively visit curdir and its subfolders.  In each folder that holds a
+        # non-empty *-annotated-*.csv, move the files flagged by isoldfile into
+        # curdir/oldfiles-<M.songexplorer_starttime>.
         dfs = []
-        for csvfile in filter(lambda x: '-annotated-' in x and x.endswith('.csv'), \
-                              os.listdir(os.path.join(V.groundtruth_folder.value, \
-                                                      subdir))):
-            filepath = os.path.join(V.groundtruth_folder.value, subdir, csvfile)
-            if os.path.getsize(filepath) > 0:
-                dfs.append(pd.read_csv(filepath, header=None, index_col=False))
+        for entry in os.listdir(curdir):
+            if os.path.isdir(os.path.join(curdir, entry)):
+                # do not descend into earlier congruence-* / oldfiles-* output folders
+                if not re.fullmatch('(congruence|oldfiles)-[0-9]{8}T[0-9]{6}', entry):
+                    _sequester(os.path.join(curdir, entry))
+            elif '-annotated-' in entry and entry.endswith('.csv'):
+                filepath = os.path.join(curdir, entry)
+                if os.path.getsize(filepath) > 0:
+                    dfs.append(pd.read_csv(filepath, header=None, index_col=False))
         if dfs:
             df = pd.concat(dfs)
             basewavs = set([os.path.splitext(x)[0] for x in df[0]])
-            oldfiles = []
-            for oldfile in filter(lambda x: isoldfile(x,subdir,basewavs), \
-                                  os.listdir(os.path.join(V.groundtruth_folder.value, \
-                                                          subdir))):
-                oldfiles.append(oldfile)
+            oldfiles = [x for x in os.listdir(curdir) if isoldfile(x, curdir, basewavs)]
             if len(oldfiles)>0:
-                topath = os.path.join(V.groundtruth_folder.value, \
-                                      subdir, \
-                                      'oldfiles-'+M.songexplorer_starttime)
+                topath = os.path.join(curdir, 'oldfiles-'+M.songexplorer_starttime)
                 os.mkdir(topath)
                 for oldfile in oldfiles:
-                    os.rename(os.path.join(V.groundtruth_folder.value, subdir, oldfile), \
-                              os.path.join(topath, oldfile))
+                    os.rename(os.path.join(curdir, oldfile), os.path.join(topath, oldfile))
+
+    _sequester(V.groundtruth_folder.value)
     V.groundtruth_update()
 
 async def train_actuate():
@@ -2006,14 +2002,19 @@ async def congruence_actuate():
     all_files = validation_files + test_files
     if '' in all_files:
         all_files.remove('')
-    logfile = os.path.join(V.groundtruth_folder.value,'congruence.log')
+    timestamp = datetime.strftime(datetime.now(),'%Y%m%dT%H%M%S')
+    congruence_folder = os.path.join(V.groundtruth_folder.value, 'congruence-'+timestamp)
+    os.mkdir(congruence_folder)
+    logfile = os.path.join(congruence_folder, 'congruence.log')
     jobid = generic_actuate("congruence", logfile,
                             M.congruence_where,
                             M.congruence_ncpu_cores,
                             M.congruence_ngpu_cards,
                             M.congruence_ngigabytes_memory,
                             M.congruence_cluster_flags,
                             "--basepath="+V.groundtruth_folder.value,
+                            "--topath="+os.path.join(V.groundtruth_folder.value,
+                                                     'congruence-'+timestamp),
                             "--wavfiles="+','.join(all_files),
                             "--portion="+V.congruence_portion.value,
                             "--convolve_ms="+V.congruence_convolve.value,
@@ -2031,7 +2032,7 @@ async def congruence_actuate():
     threads[0] = asyncio.create_task(actuate_monitor(displaystring, results, 0, \
                                      lambda l=logfile, t=currtime: recent_file_exists(l, t, False), \
                                      lambda l=logfile: contains_two_timestamps(l), \
-                                     lambda l=V.groundtruth_folder.value, t=currtime, r=regex_files,
+                                     lambda l=congruence_folder, t=currtime, r=regex_files,
                                             m=V.congruence_measure.value: congruence_succeeded(l, t, r, m)))
     asyncio.create_task(actuate_finalize(threads, results, V.groundtruth_update))
 
diff --git a/src/gui/view.py b/src/gui/view.py
@@ -1398,21 +1398,37 @@ def labelcounts_update():
     if not os.path.isdir(groundtruth_folder.value):
         labelcounts.text = ""
         return dfs, subdirs
-    for subdir in filter(lambda x: os.path.isdir(os.path.join(groundtruth_folder.value,x)), \
-                         os.listdir(groundtruth_folder.value)):
-        for csvfile in filter(lambda x: x.endswith('.csv'), \
-                              os.listdir(os.path.join(groundtruth_folder.value, subdir))):
-            filepath = os.path.join(groundtruth_folder.value, subdir, csvfile)
-            if os.path.getsize(filepath) > 0:
-                try:
-                    df = pd.read_csv(filepath, header=None, index_col=False)
-                except:
-                    bokehlog.info("WARNING: "+csvfile+" is not in the correct format")
-                if 5<=len(df.columns)<=6:
-                    dfs.append(df)
-                    subdirs.append(subdir)
-                else:
-                    bokehlog.info("WARNING: "+csvfile+" is not in the correct format")
+
+    import re
+
+    def _labelcounts_update(curdir):
+        # Recursively gather every non-empty csv with 5 or 6 columns below curdir into
+        # dfs, and the folder it was found in (relative to the groundtruth folder)
+        # into subdirs.
+        for entry in os.listdir(curdir):
+            if os.path.isdir(os.path.join(curdir, entry)):
+                # skip congruence-* / oldfiles-* output folders from any year, not
+                # only those stamped with the current one
+                if not re.fullmatch('(congruence|oldfiles)-[0-9]{8}T[0-9]{6}', entry):
+                    _labelcounts_update(os.path.join(curdir, entry))
+            elif entry.endswith('.csv'):
+                filepath = os.path.join(curdir, entry)
+                if os.path.getsize(filepath) > 0:
+                    try:
+                        df = pd.read_csv(filepath, header=None, index_col=False)
+                    except Exception:
+                        bokehlog.info("WARNING: "+entry+" is not in the correct format")
+                        # without this, df below would be unbound or left over from
+                        # the previously read file
+                        continue
+                    if 5<=len(df.columns)<=6:
+                        dfs.append(df)
+                        # strip the separator left by slicing so the name is relative
+                        subdirs.append(curdir[len(groundtruth_folder.value):].lstrip(os.sep))
+                    else:
+                        bokehlog.info("WARNING: "+entry+" is not in the correct format")
+    _labelcounts_update(groundtruth_folder.value)
+
     if dfs:
         df = pd.concat(dfs)
         M.kinds = sorted(set(df[3]))
diff --git a/test/runtests b/test/runtests
@@ -51,12 +51,12 @@ if os.name == "posix":
              os.path.join("nfeaturesexclusive-32", "xvalidate_1k", "thresholds.ckpt-300.csv"),
              os.path.join("nfeaturesexclusive-64", "xvalidate_1k", "thresholds.ckpt-30.csv"),
              os.path.join("nfeaturesexclusive-64", "xvalidate_1k", "thresholds.ckpt-300.csv"),
-             os.path.join("groundtruth-data", "congruence.tic.ambient.csv"),
-             os.path.join("groundtruth-data", "congruence.tic.mel-pulse.csv"),
-             os.path.join("groundtruth-data", "congruence.tic.mel-sine.csv"),
-             os.path.join("groundtruth-data", "congruence.label.ambient.csv"),
-             os.path.join("groundtruth-data", "congruence.label.mel-pulse.csv"),
-             os.path.join("groundtruth-data", "congruence.label.mel-sine.csv")
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.tic.ambient.csv"),
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.tic.mel-pulse.csv"),
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.tic.mel-sine.csv"),
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.label.ambient.csv"),
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.label.mel-pulse.csv"),
+             os.path.join("groundtruth-data", "congruence-11112233T445566", "congruence.label.mel-sine.csv")
              ]
     for file in files:
       if not cmp(os.path.join(repo_path, "test", "scratch", "tutorial-sh", file),
diff --git a/test/tutorial.py b/test/tutorial.py
diff --git a/test/tutorial.sh b/test/tutorial.sh