Wauplin HF staff committed on
Commit
cb75609
1 Parent(s): 62a74cd

Clean duplicates in user history

Browse files
Files changed (1) hide show
  1. user_history.py +78 -112
user_history.py CHANGED
@@ -15,6 +15,7 @@ Useful links:
15
  - Source file: https://huggingface.co/spaces/Wauplin/gradio-user-history/blob/main/user_history.py
16
  - Discussions: https://huggingface.co/spaces/Wauplin/gradio-user-history/discussions
17
  """
 
18
  import json
19
  import os
20
  import shutil
@@ -37,8 +38,8 @@ def setup(folder_path: str | Path | None = None) -> None:
37
  user_history.folder_path = _resolve_folder_path(folder_path)
38
  user_history.initialized = True
39
 
40
- # TODO: remove this section once all Spaces have migrated
41
- _migrate_history()
42
 
43
 
44
  def render() -> None:
@@ -46,9 +47,7 @@ def render() -> None:
46
 
47
  # initialize with default config
48
  if not user_history.initialized:
49
- print(
50
- "Initializing user history with default config. Use `user_history.setup(...)` to customize folder_path."
51
- )
52
  setup()
53
 
54
  # Render user history tab
@@ -83,18 +82,11 @@ def render() -> None:
83
 
84
  # "Export zip" row (hidden by default)
85
  with gr.Row():
86
- export_file = gr.File(
87
- file_count="single",
88
- file_types=[".zip"],
89
- label="Exported history",
90
- visible=False,
91
- )
92
 
93
  # "Config deletion" row (hidden by default)
94
  with gr.Row():
95
- confirm_button = gr.Button(
96
- "Confirm delete all history", variant="stop", visible=False
97
- )
98
  cancel_button = gr.Button("Cancel", visible=False)
99
 
100
  # Gallery
@@ -117,12 +109,8 @@ def render() -> None:
117
  gallery.attach_load_event(_fetch_user_history, every=None)
118
 
119
  # Interactions
120
- refresh_button.click(
121
- fn=_fetch_user_history, inputs=[], outputs=[gallery], queue=False
122
- )
123
- export_button.click(
124
- fn=_export_user_history, inputs=[], outputs=[export_file], queue=False
125
- )
126
 
127
  # Taken from https://github.com/gradio-app/gradio/issues/3324#issuecomment-1446382045
128
  delete_button.click(
@@ -203,9 +191,7 @@ class _UserHistory(object):
203
 
204
  def _user_lock(self, username: str) -> FileLock:
205
  """Ensure history is not corrupted if concurrent calls."""
206
- return FileLock(
207
- self.folder_path / f"{username}.lock"
208
- ) # lock outside of folder => better when exporting ZIP
209
 
210
  def _user_jsonl_path(self, username: str) -> Path:
211
  return self._user_path(username) / "history.jsonl"
@@ -225,9 +211,7 @@ def _fetch_user_history(profile: gr.OAuthProfile | None) -> List[Tuple[str, str]
225
 
226
  user_history = _UserHistory()
227
  if not user_history.initialized:
228
- warnings.warn(
229
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
230
- )
231
  return []
232
 
233
  with user_history._user_lock(username):
@@ -253,17 +237,13 @@ def _export_user_history(profile: gr.OAuthProfile | None) -> Dict | None:
253
 
254
  user_history = _UserHistory()
255
  if not user_history.initialized:
256
- warnings.warn(
257
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
258
- )
259
  return None
260
 
261
  # Zip history
262
  with user_history._user_lock(username):
263
  path = shutil.make_archive(
264
- str(_archives_path() / f"history_{username}"),
265
- "zip",
266
- user_history._user_path(username),
267
  )
268
 
269
  return gr.update(visible=True, value=path)
@@ -278,9 +258,7 @@ def _delete_user_history(profile: gr.OAuthProfile | None) -> None:
278
 
279
  user_history = _UserHistory()
280
  if not user_history.initialized:
281
- warnings.warn(
282
- "User history is not set in Gradio demo. You must use `user_history.render(...)` first."
283
- )
284
  return
285
 
286
  with user_history._user_lock(username):
@@ -317,9 +295,7 @@ def _resolve_folder_path(folder_path: str | Path | None) -> Path:
317
  if folder_path is not None:
318
  return Path(folder_path).expanduser().resolve()
319
 
320
- if os.getenv("SYSTEM") == "spaces" and os.path.exists(
321
- "/data"
322
- ): # Persistent storage is enabled!
323
  return Path("/data") / "_user_history"
324
 
325
  # Not in a Space or Persistent storage not enabled => local folder
@@ -380,10 +356,8 @@ def _get_nb_users() -> int:
380
  user_history = _UserHistory()
381
  if not user_history.initialized:
382
  return 0
383
- if user_history.folder_path is not None:
384
- return len(
385
- [path for path in user_history.folder_path.iterdir() if path.is_dir()]
386
- )
387
  return 0
388
 
389
 
@@ -391,7 +365,7 @@ def _get_nb_images() -> int:
391
  user_history = _UserHistory()
392
  if not user_history.initialized:
393
  return 0
394
- if user_history.folder_path is not None:
395
  return len([path for path in user_history.folder_path.glob("*/images/*")])
396
  return 0
397
 
@@ -425,14 +399,10 @@ def _disk_space_warning_message() -> str:
425
 
426
 
427
  def _get_disk_usage(path: Path) -> Tuple[int, int, int]:
428
- for path in [path] + list(
429
- path.parents
430
- ): # first check target_dir, then each parents one by one
431
  try:
432
  return shutil.disk_usage(path)
433
- except (
434
- OSError
435
- ): # if doesn't exist or can't read => fail silently and try parent one
436
  pass
437
  return 0, 0, 0
438
 
@@ -451,74 +421,70 @@ def _fetch_admins() -> List[str]:
451
  # Running in Space => try to fetch organization members
452
  # Otherwise, it's not an organization => namespace is the user
453
  namespace = space_id.split("/")[0]
454
- response = requests.get(
455
- f"https://huggingface.co/api/organizations/{namespace}/members"
456
- )
457
  if response.status_code == 200:
458
- return sorted(
459
- (member["user"] for member in response.json()), key=lambda x: x.lower()
460
- )
461
  return [namespace]
462
 
463
 
464
- ################################################################
465
- # Legacy helpers to migrate image structure to new data format #
466
- ################################################################
467
- # TODO: remove this section once all Spaces have migrated
468
 
469
-
470
- def _migrate_history():
471
- """Script to migrate user history from v0 to v1."""
472
- legacy_history_path = _legacy_get_history_folder_path()
473
- if not legacy_history_path.exists():
474
- return
475
-
476
- error_count = 0
477
- for json_path in legacy_history_path.glob("*.json"):
478
- username = json_path.stem
479
- print(f"Migrating history for user {username}...")
480
- error_count += _legacy_move_user_history(username)
481
- print("Done.")
482
- print(f"Migration complete. {error_count} error(s) happened.")
483
-
484
- if error_count == 0:
485
- shutil.rmtree(legacy_history_path, ignore_errors=True)
486
-
487
-
488
- def _legacy_move_user_history(username: str) -> int:
489
- history = _legacy_read_user_history(username)
490
- error_count = 0
491
- for image, prompt in reversed(history):
492
- try:
493
- save_image(
494
- label=prompt, image=image, profile={"preferred_username": username}
495
- )
496
- except Exception as e:
497
- print("Issue while migrating image:", e)
498
- error_count += 1
499
- return error_count
500
 
501
 
502
- def _legacy_get_history_folder_path() -> Path:
503
- _folder = os.environ.get("HISTORY_FOLDER")
504
- if _folder is None:
505
- _folder = Path(__file__).parent / "history"
506
- return Path(_folder)
507
-
508
-
509
- def _legacy_read_user_history(username: str) -> List[Tuple[str, str]]:
510
- """Return saved history for that user."""
511
- with _legacy_user_lock(username):
512
- path = _legacy_user_history_path(username)
513
- if path.exists():
514
- return json.loads(path.read_text())
515
- return [] # No history yet
516
-
517
-
518
- def _legacy_user_history_path(username: str) -> Path:
519
- return _legacy_get_history_folder_path() / f"{username}.json"
520
-
521
 
522
- def _legacy_user_lock(username: str) -> FileLock:
523
- """Ensure history is not corrupted if concurrent calls."""
524
- return FileLock(f"{_legacy_user_history_path(username)}.lock")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  - Source file: https://huggingface.co/spaces/Wauplin/gradio-user-history/blob/main/user_history.py
16
  - Discussions: https://huggingface.co/spaces/Wauplin/gradio-user-history/discussions
17
  """
18
+ import hashlib
19
  import json
20
  import os
21
  import shutil
 
38
  user_history.folder_path = _resolve_folder_path(folder_path)
39
  user_history.initialized = True
40
 
41
+ # Clean duplicates
42
+ _clean_duplicates()
43
 
44
 
45
  def render() -> None:
 
47
 
48
  # initialize with default config
49
  if not user_history.initialized:
50
+ print("Initializing user history with default config. Use `user_history.setup(...)` to customize folder_path.")
 
 
51
  setup()
52
 
53
  # Render user history tab
 
82
 
83
  # "Export zip" row (hidden by default)
84
  with gr.Row():
85
+ export_file = gr.File(file_count="single", file_types=[".zip"], label="Exported history", visible=False)
 
 
 
 
 
86
 
87
  # "Config deletion" row (hidden by default)
88
  with gr.Row():
89
+ confirm_button = gr.Button("Confirm delete all history", variant="stop", visible=False)
 
 
90
  cancel_button = gr.Button("Cancel", visible=False)
91
 
92
  # Gallery
 
109
  gallery.attach_load_event(_fetch_user_history, every=None)
110
 
111
  # Interactions
112
+ refresh_button.click(fn=_fetch_user_history, inputs=[], outputs=[gallery], queue=False)
113
+ export_button.click(fn=_export_user_history, inputs=[], outputs=[export_file], queue=False)
 
 
 
 
114
 
115
  # Taken from https://github.com/gradio-app/gradio/issues/3324#issuecomment-1446382045
116
  delete_button.click(
 
191
 
192
  def _user_lock(self, username: str) -> FileLock:
193
  """Ensure history is not corrupted if concurrent calls."""
194
+ return FileLock(self.folder_path / f"{username}.lock") # lock outside of folder => better when exporting ZIP
 
 
195
 
196
  def _user_jsonl_path(self, username: str) -> Path:
197
  return self._user_path(username) / "history.jsonl"
 
211
 
212
  user_history = _UserHistory()
213
  if not user_history.initialized:
214
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
215
  return []
216
 
217
  with user_history._user_lock(username):
 
237
 
238
  user_history = _UserHistory()
239
  if not user_history.initialized:
240
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
241
  return None
242
 
243
  # Zip history
244
  with user_history._user_lock(username):
245
  path = shutil.make_archive(
246
+ str(_archives_path() / f"history_{username}"), "zip", user_history._user_path(username)
 
 
247
  )
248
 
249
  return gr.update(visible=True, value=path)
 
258
 
259
  user_history = _UserHistory()
260
  if not user_history.initialized:
261
+ warnings.warn("User history is not set in Gradio demo. You must use `user_history.render(...)` first.")
 
 
262
  return
263
 
264
  with user_history._user_lock(username):
 
295
  if folder_path is not None:
296
  return Path(folder_path).expanduser().resolve()
297
 
298
+ if os.getenv("SYSTEM") == "spaces" and os.path.exists("/data"): # Persistent storage is enabled!
 
 
299
  return Path("/data") / "_user_history"
300
 
301
  # Not in a Space or Persistent storage not enabled => local folder
 
356
  user_history = _UserHistory()
357
  if not user_history.initialized:
358
  return 0
359
+ if user_history.folder_path is not None and user_history.folder_path.exists():
360
+ return len([path for path in user_history.folder_path.iterdir() if path.is_dir()])
 
 
361
  return 0
362
 
363
 
 
365
  user_history = _UserHistory()
366
  if not user_history.initialized:
367
  return 0
368
+ if user_history.folder_path is not None and user_history.folder_path.exists():
369
  return len([path for path in user_history.folder_path.glob("*/images/*")])
370
  return 0
371
 
 
399
 
400
 
401
  def _get_disk_usage(path: Path) -> Tuple[int, int, int]:
402
+ for path in [path] + list(path.parents): # first check target_dir, then each parents one by one
 
 
403
  try:
404
  return shutil.disk_usage(path)
405
+ except OSError: # if doesn't exist or can't read => fail silently and try parent one
 
 
406
  pass
407
  return 0, 0, 0
408
 
 
421
  # Running in Space => try to fetch organization members
422
  # Otherwise, it's not an organization => namespace is the user
423
  namespace = space_id.split("/")[0]
424
+ response = requests.get(f"https://huggingface.co/api/organizations/{namespace}/members")
 
 
425
  if response.status_code == 200:
426
+ return sorted((member["user"] for member in response.json()), key=lambda x: x.lower())
 
 
427
  return [namespace]
428
 
429
 
430
+ #######
431
+ #######
 
 
432
 
433
+ # TODO: remove this section from IllusionDiffusion once duplicates are cleaned
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
434
 
435
 
436
def _clean_duplicates() -> None:
    """Remove duplicated images from every user's history (one-off cleanup).

    Does nothing unless user history is initialized and its folder exists.

    For each sub-folder of the history folder that contains a `history.jsonl`:
      - entries are de-duplicated on the content hash of the image they point
        to (see `_file_hash`); the first occurrence is kept,
      - entries whose image file cannot be hashed (not an existing file) are dropped,
      - files under `images/` that are not referenced by a kept entry are deleted,
      - `history.jsonl` is rewritten with the kept entries only.

    A file lock and a `_clean_duplicates_is_done` marker file in the history
    folder ensure that only one replica runs the cleanup, once for all.
    """
    user_history = _UserHistory()
    if not (user_history.initialized and user_history.folder_path.exists()):
        # Must be initialized correctly
        return

    root = user_history.folder_path
    lock_path = root / "_clean_duplicates.lock"
    done_marker = root / "_clean_duplicates_is_done"  # Only 1 replica will do it, once for all

    with FileLock(lock_path):
        if done_marker.exists():  # another replica already did it
            return

        for user_dir in root.iterdir():
            if user_dir.is_file():
                continue

            history_file = user_dir / "history.jsonl"
            if not history_file.exists():
                continue

            # Read history. Blank lines are skipped: they carry no record and
            # would make `json.loads` raise.
            entries = [json.loads(line) for line in history_file.read_text().splitlines() if line.strip()]

            # Select unique images (first occurrence wins)
            kept_entries = []
            seen_hashes = set()
            kept_paths = set()
            for entry in entries:
                image_path = Path(entry["path"])
                image_hash = _file_hash(image_path)
                if image_hash is None or image_hash in seen_hashes:
                    continue
                seen_hashes.add(image_hash)
                # Store the resolved path so that the comparison below does not
                # depend on how the path was spelled in the history file.
                kept_paths.add(image_path.resolve())
                kept_entries.append(entry)

            # Remove files that are not referenced by a kept entry
            for candidate in user_dir.glob("images/*"):
                if candidate.resolve() in kept_paths:
                    continue
                try:
                    candidate.unlink()
                except OSError:
                    pass  # best-effort cleanup: a leftover file is harmless

            # Save history. Every record is newline-terminated (JSONL convention)
            # so that a record appended afterwards starts on its own line.
            # NOTE(review): the code appending to this file is not visible here -
            # confirm it writes `json.dumps(...) + "\n"`.
            history_file.write_text("".join(json.dumps(entry) + "\n" for entry in kept_entries))

        done_marker.touch()
484
+
485
+
486
+ def _file_hash(path: Path) -> str | None:
487
+ """Return the hash of a file. No need to read by chunks."""
488
+ if path.is_file():
489
+ return hashlib.md5(path.read_bytes()).hexdigest()
490
+ return None