Source file `bibtex.ml`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
(* BibTeX Parser and Pretty Printer in OCaml *)
(* Edited from the version generated by Claude at
   https://claude.ai/share/6cdc04c1-ca20-4efc-93c2-474438dda8cf *)

(* Types for representing BibTeX entries *)
type field_value =
  | QuotedStringValue of string
  | BracedStringValue of string
  | UnquotedStringValue of string
  | NumberValue of int

type field = { name : string; value : field_value }

type entry_type =
  | Article
  | Book
  | Booklet
  | Conference
  | InBook
  | InCollection
  | InProceedings
  | Manual
  | MastersThesis
  | Misc
  | PhdThesis
  | Proceedings
  | TechReport
  | Unpublished

type entry_content = Field of field | EntryComment of string

type bibtex_entry = {
  entry_type : entry_type;
  citekey : string;
  contents : entry_content list;
}

type bibtex_item = Entry of bibtex_entry | Comment of string
type parse_error = { line : int; position : int; message : string }
type parse_result = { items : bibtex_item list; errors : parse_error list }

(* Utility functions *)
let string_of_entry_type = function
  | Article -> "article"
  | Book -> "book"
  | Booklet -> "booklet"
  | Conference -> "conference"
  | InBook -> "inbook"
  | InCollection -> "incollection"
  | InProceedings -> "inproceedings"
  | Manual -> "manual"
  | MastersThesis -> "mastersthesis"
  | Misc -> "misc"
  | PhdThesis -> "phdthesis"
  | Proceedings -> "proceedings"
  | TechReport -> "techreport"
  | Unpublished -> "unpublished"

let entry_type_of_string s =
  match String.lowercase_ascii s with
  | "article" -> Article
  | "book" -> Book
  | "booklet" -> Booklet
  | "conference" -> Conference
  | "inbook" -> InBook
  | "incollection" -> InCollection
  | "inproceedings" -> InProceedings
  | "manual" -> Manual
  | "mastersthesis" -> MastersThesis
  | "misc" -> Misc
  | "phdthesis" -> PhdThesis
  | "proceedings" -> Proceedings
  | "techreport" -> TechReport
  | "unpublished" -> Unpublished
  | _ -> Misc (* fallback to misc for unknown types *)

(* Simple Parser Combinator Library *)
(* type 'a parser = string -> int -> ('a * int) option *)

let return x _input pos = Some (x, pos)
(* let fail _input _pos = None *)

let bind p f input pos =
  match p input pos with Some (x, pos') -> f x input pos' | None -> None

let ( >>= ) = bind

let ( <|> ) p1 p2 input pos =
  match p1 input pos with Some result -> Some result | None -> p2 input pos

let rec many p input pos =
  match p input pos with
  | Some (x, pos') -> (
      match many p input pos' with
      | Some (xs, pos'') -> Some (x :: xs, pos'')
      | None -> Some ([ x ], pos'))
  | None -> Some ([], pos)

let many_with_errors p input pos =
  let get_line_number pos =
    let rec count_lines pos line_num =
      if pos <= 0 then line_num
      else if input.[pos - 1] = '\n' then count_lines (pos - 1) (line_num + 1)
      else count_lines (pos - 1) line_num
    in
    count_lines pos 1
  in
  let rec aux acc_items acc_errors curr_pos =
    match p input curr_pos with
    | Some (x, pos') -> aux (x :: acc_items) acc_errors pos'
    | None ->
        (* Try to skip to next entry by finding the next '@' character *)
        let rec find_next_entry pos =
          if pos >= String.length input then pos
          else if input.[pos] = '@' then pos
          else find_next_entry (pos + 1)
        in
        let next_pos = find_next_entry curr_pos in
        if next_pos >= String.length input then
          (* End of input *)
          Some (List.rev acc_items, List.rev acc_errors, curr_pos)
        else if next_pos = curr_pos then
          (* We're at an '@' but couldn't parse, record error and skip *)
          let error =
            {
              line = get_line_number curr_pos;
              position = curr_pos;
              message =
                "Failed to parse entry starting at line "
                ^ string_of_int (get_line_number curr_pos);
            }
          in
          aux acc_items (error :: acc_errors) (next_pos + 1)
        else
          (* Found next '@', try parsing again *)
          let error =
            {
              line = get_line_number curr_pos;
              position = curr_pos;
              message =
                "Skipped unparsable content from line "
                ^ string_of_int (get_line_number curr_pos)
                ^ " to line "
                ^ string_of_int (get_line_number next_pos);
            }
          in
          aux acc_items (error :: acc_errors) next_pos
  in
  aux [] [] pos

(* let many1 p =
  p >>= fun x ->
  many p >>= fun xs -> return (x :: xs) *)

let optional p input pos =
  match p input pos with
  | Some (x, pos') -> Some (Some x, pos')
  | None -> Some (None, pos)

let peek_char input pos =
  if pos < String.length input then Some input.[pos] else None

let char c input pos =
  match peek_char input pos with
  | Some c' when c = c' -> Some (c, pos + 1)
  | _ -> None

let satisfy pred input pos =
  match peek_char input pos with
  | Some c when pred c -> Some (c, pos + 1)
  | _ -> None

let rec skip_while pred input pos =
  match peek_char input pos with
  | Some c when pred c -> skip_while pred input (pos + 1)
  | _ -> pos

(* Helper function to skip whitespace and return new position *)
let skip_whitespace input pos =
  skip_while
    (function ' ' | '\t' | '\n' | '\r' -> true | _ -> false)
    input pos

let whitespace input pos =
  let pos' = skip_whitespace input pos in
  Some ((), pos')

let ws p =
  whitespace >>= fun _ ->
  p >>= fun x ->
  whitespace >>= fun _ -> return x

(* let string s input pos =
  let len = String.length s in
  if pos + len <= String.length input && String.sub input pos len = s then
    Some (s, pos + len)
  else None *)

let take_while pred input pos =
  let start = pos in
  let pos' = skip_while pred input pos in
  if pos' > start then Some (String.sub input start (pos' - start), pos')
  else None

let take_while1 pred input pos =
  match take_while pred input pos with
  | Some (s, pos') when String.length s > 0 -> Some (s, pos')
  | _ -> None

(* BibTeX specific parsers *)
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_digit c = c >= '0' && c <= '9'
let is_alnum c = is_alpha c || is_digit c
let is_ident_char c = is_alnum c || c = '_' || c = '-' || c = ':'

(* More permissive field name parser to handle dots, plus signs, etc. *)
let is_field_name_char c =
  is_alnum c || c = '_' || c = '-' || c = ':' || c = '.' || c = '+'

let identifier =
  satisfy (fun c -> is_alpha c || c = '_') >>= fun first ->
  take_while is_ident_char >>= fun rest -> return (String.make 1 first ^ rest)

let field_name =
  satisfy (fun c -> is_alpha c || c = '_') >>= fun first ->
  take_while is_field_name_char >>= fun rest ->
  return (String.make 1 first ^ rest)

let number =
  take_while1 is_digit >>= fun digits -> return (int_of_string digits)

(* Helper to handle multi-byte UTF-8 characters safely *)
let is_utf8_continuation_byte c =
  let code = Char.code c in
  code >= 0x80 && code <= 0xBF

(* Helper function to collect UTF-8 multi-byte character sequence *)
let collect_utf8_sequence input pos start_char =
  let rec collect_utf8 bytes_left acc_utf8 curr_pos =
    if bytes_left = 0 then (acc_utf8, curr_pos)
    else
      match peek_char input curr_pos with
      | Some c when is_utf8_continuation_byte c ->
          collect_utf8 (bytes_left - 1)
            (acc_utf8 ^ String.make 1 c)
            (curr_pos + 1)
      | _ -> (acc_utf8, curr_pos)
  in
  let bytes_to_read =
    let code = Char.code start_char in
    if code >= 0xC0 && code <= 0xDF then 1
    else if code >= 0xE0 && code <= 0xEF then 2
    else if code >= 0xF0 && code <= 0xF7 then 3
    else 0
  in
  collect_utf8 bytes_to_read (String.make 1 start_char) (pos + 1)

let quoted_string input pos =
  match char '"' input pos with
  | Some (_, pos1) ->
      let rec loop acc pos brace_level =
        match peek_char input pos with
        | None -> None
        | Some '"' when brace_level = 0 -> Some (acc, pos + 1)
        | Some '{' -> loop (acc ^ "{") (pos + 1) (brace_level + 1)
        | Some '}' -> loop (acc ^ "}") (pos + 1) (max 0 (brace_level - 1))
        | Some '\\' when pos + 1 < String.length input ->
            let escaped = String.sub input (pos + 1) 1 in
            loop (acc ^ "\\" ^ escaped) (pos + 2) brace_level
        | Some c ->
            (* Check if this is a UTF-8 multi-byte character *)
            if Char.code c >= 0xC0 then
              let utf8_seq, new_pos = collect_utf8_sequence input pos c in
              loop (acc ^ utf8_seq) new_pos brace_level
            else
              (* ASCII or continuation byte (shouldn't happen here) *)
              loop (acc ^ String.make 1 c) (pos + 1) brace_level
      in
      loop "" pos1 0
  | None -> None

let braced_string input pos =
  match char '{' input pos with
  | Some (_, pos1) ->
      let rec loop acc pos brace_level =
        match peek_char input pos with
        | None -> None
        | Some '}' when brace_level = 1 -> Some (acc, pos + 1)
        | Some '{' -> loop (acc ^ "{") (pos + 1) (brace_level + 1)
        | Some '}' -> loop (acc ^ "}") (pos + 1) (brace_level - 1)
        | Some '\\' when pos + 1 < String.length input ->
            let escaped = String.sub input (pos + 1) 1 in
            loop (acc ^ "\\" ^ escaped) (pos + 2) brace_level
        | Some c ->
            (* Check if this is a UTF-8 multi-byte character *)
            if Char.code c >= 0xC0 then
              let utf8_seq, new_pos = collect_utf8_sequence input pos c in
              loop (acc ^ utf8_seq) new_pos brace_level
            else
              (* ASCII or continuation byte (shouldn't happen here) *)
              loop (acc ^ String.make 1 c) (pos + 1) brace_level
      in
      loop "" pos1 1
  | None -> None

(* Parser for unquoted values like month=jul *)
let is_unquoted_value_char c =
  is_alnum c || c = '_' || c = '-' || c = '.' || c = '/' || c = ':' || c = '+'
  || c = '~' || c = '@' || c = '#' || c = '$' || c = '%' || c = '^' || c = '&'
  || c = '*' || c = '(' || c = ')' || c = '[' || c = ']'

let unquoted_value = take_while1 is_unquoted_value_char

(* Enhanced field_value parser that can handle unquoted values like month=jul *)
let field_value =
  quoted_string
  >>= (fun s -> return (QuotedStringValue s))
  <|> (braced_string >>= fun s -> return (BracedStringValue s))
  <|> (number >>= fun n -> return (NumberValue n))
  <|> (unquoted_value >>= fun s -> return (UnquotedStringValue s))

let field_entry =
  ws field_name >>= fun name ->
  ws (char '=') >>= fun _ ->
  ws field_value >>= fun value -> return { name; value }

(* Parse comments inside bibtex entries *)
let parse_comment_line input pos =
  match peek_char input pos with
  | Some '%' ->
      (* Get the comment text without the % *)
      let comment_start = pos + 1 in
      let rec find_eol pos =
        match peek_char input pos with
        | None -> pos
        | Some '\n' -> pos
        | Some '\r' -> pos
        | Some _ -> find_eol (pos + 1)
      in
      let eol_pos = find_eol (pos + 1) in
      let comment_text =
        if eol_pos > comment_start then
          String.sub input comment_start (eol_pos - comment_start)
        else ""
      in

      (* Skip past the EOL character(s) *)
      let next_pos =
        match peek_char input eol_pos with
        | None -> eol_pos
        | Some '\n' -> eol_pos + 1
        | Some '\r' ->
            if eol_pos + 1 < String.length input && input.[eol_pos + 1] = '\n'
            then eol_pos + 2
            else eol_pos + 1
        | Some _ -> eol_pos (* Should not happen *)
      in

      Some (comment_text, next_pos)
  | _ -> None

(* Parse an entry comment *)
let entry_comment_parser input pos =
  match parse_comment_line input pos with
  | Some (comment_text, pos') -> Some (EntryComment comment_text, pos')
  | None -> None

(* Parse either a field or a comment within an entry *)
(* let entry_content_parser input pos =
  match entry_comment_parser input pos with
  | Some result -> Some result
  | None -> (
      match field_entry input pos with
      | Some (field, pos') -> Some (Field field, pos')
      | None -> None) *)

(* Whitespace parser that collects comments within entries *)
(* let ws_with_comments p input pos =
  let pos' = skip_whitespace input pos in
  match p input pos' with
  | Some (x, pos'') ->
      let pos''' = skip_whitespace input pos'' in
      Some (x, pos''')
  | None -> None *)

(* Parse the contents of an entry - both fields and comments *)
let entry_contents_parser input pos =
  let parse_item input pos =
    let pos_after_ws = skip_whitespace input pos in
    (* Try to parse a comment first *)
    match entry_comment_parser input pos_after_ws with
    | Some (comment, pos') -> Some (comment, pos')
    | None -> (
        (* Try to parse a field *)
        match field_entry input pos_after_ws with
        | Some (field, pos') -> Some (Field field, pos')
        | None -> None)
  in
  let rec parse_contents acc input pos =
    let pos_after_ws = skip_whitespace input pos in
    (* Check for end of entry *)
    match peek_char input pos_after_ws with
    | Some '}' ->
        (* End of entry *)
        Some (List.rev acc, pos_after_ws)
    | Some ',' ->
        (* Skip comma and continue *)
        parse_contents acc input (pos_after_ws + 1)
    | _ -> (
        (* Try to parse an item (field or comment) *)
        match parse_item input pos_after_ws with
        | Some (item, pos') ->
            (* Got an item, look for comma or end *)
            let pos_after_ws' = skip_whitespace input pos' in
            (* Continue parsing with this item added *)
            parse_contents (item :: acc) input pos_after_ws'
        | None -> (
            (* No valid item found, check if we're at the end *)
            match peek_char input pos_after_ws with
            | Some '}' -> Some (List.rev acc, pos_after_ws)
            | _ -> None))
  in
  parse_contents [] input pos

let entry_type_parser =
  identifier >>= fun type_str -> return (entry_type_of_string type_str)

let bibtex_entry =
  ws (char '@') >>= fun _ ->
  entry_type_parser >>= fun entry_type ->
  ws (char '{') >>= fun _ ->
  ws identifier >>= fun citekey ->
  optional (ws (char ',')) >>= fun _ ->
  (* Parse contents (fields and comments) *)
  entry_contents_parser >>= fun contents ->
  ws (char '}') >>= fun _ -> return { entry_type; citekey; contents }

let comment =
  char '%' >>= fun _ ->
  take_while (fun c -> c <> '\n' && c <> '\r') >>= fun text -> return text

let bibtex_item =
  bibtex_entry
  >>= (fun entry -> return (Entry entry))
  <|> (comment >>= fun text -> return (Comment text))

let bibtex_file =
  many (ws bibtex_item) >>= fun items ->
  ws (return ()) >>= fun _ -> return items

let bibtex_file_with_errors input pos =
  match many_with_errors (ws bibtex_item) input pos with
  | Some (items, errors, final_pos) -> (
      match ws (return ()) input final_pos with
      | Some (_, end_pos) -> Some ({ items; errors }, end_pos)
      | None -> Some ({ items; errors }, final_pos))
  | None -> Some ({ items = []; errors = [] }, pos)

(* Parser runner *)
let parse_with parser input =
  match parser input 0 with Some (result, _) -> Some result | None -> None

let parse_bibtex input =
  match parse_with bibtex_file input with Some items -> items | None -> []

let parse_bibtex_with_errors input =
  match parse_with bibtex_file_with_errors input with
  | Some result -> result
  | None ->
      {
        items = [];
        errors =
          [
            { line = 1; position = 0; message = "Failed to parse BibTeX file" };
          ];
      }

let has_parse_errors result = result.errors <> []
let get_parse_errors result = result.errors
let get_parsed_items result = result.items

type options = { capitalize_names : bool; strict : bool; align_entries : bool }

let default_options =
  { capitalize_names = true; strict = false; align_entries = true }

(* String replacement helper for Unicode normalization *)
let replace_string ~pattern ~replacement text =
  let pattern_len = String.length pattern in
  if pattern_len = 0 then text
  else
    let rec find_and_replace start_pos acc =
      try
        let match_pos = String.index_from text start_pos pattern.[0] in
        if
          match_pos + pattern_len <= String.length text
          && String.sub text match_pos pattern_len = pattern
        then
          (* Found a match - add up to match, add replacement, continue *)
          let prefix = String.sub text start_pos (match_pos - start_pos) in
          find_and_replace (match_pos + pattern_len) (acc ^ prefix ^ replacement)
        else
          (* False positive - add the character and continue *)
          let next_pos = match_pos + 1 in
          let prefix = String.sub text start_pos (next_pos - start_pos) in
          find_and_replace next_pos (acc ^ prefix)
      with Not_found ->
        (* No more matches - add the rest of the string *)
        if start_pos < String.length text then
          acc ^ String.sub text start_pos (String.length text - start_pos)
        else acc
    in
    find_and_replace 0 ""

(* Normalize Unicode to ASCII equivalents where possible *)
let escape_table =
  let escapes =
    List.map
      (fun s -> Re.compile @@ Re.str s)
      [ "%2F"; "%28"; "%29"; "%3C"; "%3E"; "%3A"; "%3B"; "%3F"; "%26" ]
  in
  let chars = [ "/"; "("; ")"; "<"; ">"; ":"; ";"; "?"; "&" ] in
  List.combine escapes chars

let unescape_url s =
  List.fold_left
    (fun s (re_code, chr) -> Re.replace_string ~all:true re_code ~by:chr s)
    s escape_table

let normalize_unicode s =
  (* This is a simplified version that just handles a few common cases *)
  let replacements =
    [
      ("\226\128\147", "-");
      (* en-dash *)
      ("\226\128\148", "--");
      (* em-dash *)
      ("\226\128\156", "\"");
      (* left double quote *)
      ("\226\128\157", "\"");
      (* right double quote *)
      ("\226\128\152", "'");
      (* left single quote *)
      ("\226\128\153", "'");
      (* right single quote *)
    ]
  in
  let s_ref = ref s in
  List.iter
    (fun (from, to_) ->
      s_ref := replace_string ~pattern:from ~replacement:to_ !s_ref)
    replacements;
  !s_ref

(* Pretty printer *)
let format_field_value = function
  | QuotedStringValue s ->
      let normalized_s = normalize_unicode s in
      "\"" ^ normalized_s ^ "\""
  | BracedStringValue s ->
      let normalized_s = normalize_unicode s in
      "{" ^ normalized_s ^ "}"
  | UnquotedStringValue s ->
      let normalized_s = normalize_unicode s in
      "{" ^ normalized_s ^ "}"
  | NumberValue n -> string_of_int n

let format_field_value_with_url_unescaping field_name field_value =
  if String.lowercase_ascii field_name = "url" then
    match field_value with
    | QuotedStringValue s ->
        let unescaped_s = unescape_url s in
        let normalized_s = normalize_unicode unescaped_s in
        "\"" ^ normalized_s ^ "\""
    | BracedStringValue s ->
        let unescaped_s = unescape_url s in
        let normalized_s = normalize_unicode unescaped_s in
        "{" ^ normalized_s ^ "}"
    | UnquotedStringValue s ->
        let unescaped_s = unescape_url s in
        let normalized_s = normalize_unicode unescaped_s in
        "{" ^ normalized_s ^ "}"
    | NumberValue n -> string_of_int n
  else format_field_value field_value

let format_entry_name capitalized name =
  if capitalized then String.uppercase_ascii name else name

let format_field_with_padding capitalized field max_width =
  let padding = String.make (max_width - String.length field.name) ' ' in
  let name = format_entry_name capitalized field.name in
  "  " ^ name ^ padding ^ " = "
  ^ format_field_value_with_url_unescaping field.name field.value

let format_field capitalized field =
  "  "
  ^ format_entry_name capitalized field.name
  ^ " = "
  ^ format_field_value_with_url_unescaping field.name field.value

let format_entry_content_with_padding capitalized max_width = function
  | Field field -> format_field_with_padding capitalized field max_width
  | EntryComment comment -> "  %" ^ comment

let format_entry_content capitalized = function
  | Field field -> format_field capitalized field
  | EntryComment comment -> "  %" ^ comment

let format_entry options entry =
  (if options.strict then
     (* Maybe this should be moved to the parser.
        For now here is good enough. *)
     let field_names =
       List.fold_left
         (fun acc content ->
           match content with
           | Field field -> String.lowercase_ascii field.name :: acc
           | EntryComment _ -> acc)
         [] entry.contents
     in
     let has_duplicates =
       List.length field_names
       <> List.length (List.sort_uniq String.compare field_names)
     in
     if has_duplicates then
       failwith ("Duplicate fields found in entry: " ^ entry.citekey));

  (* Format the entry *)
  let format_entry =
    if options.align_entries then
      let max_field_width =
        (* Calculate maximum field name length for alignment *)
        List.fold_left
          (fun acc content ->
            match content with
            | Field field -> max acc (String.length field.name)
            | EntryComment _ -> acc)
          0 entry.contents
      in
      format_entry_content_with_padding options.capitalize_names max_field_width
    else format_entry_content options.capitalize_names
  in
  let entry_type_str = string_of_entry_type entry.entry_type in
  let header = "@" ^ entry_type_str ^ "{" ^ entry.citekey in
  let contents_str =
    if entry.contents = [] then ""
    else
      let formatted_contents =
        entry.contents
        (* DOI's bibtex API adds the "month" field but raises a warning with
        many bib engines, and it is kind of useless. So we filter it out *)
        |> List.filter (function
             | Field f when String.(lowercase_ascii @@ f.name) = "month" ->
                 false
             | _ -> true)
        |> List.map format_entry
      in
      let rec add_commas_except_last = function
        | [] -> []
        | [ last ] -> [ last ] (* No comma for the last item *)
        | content :: rest ->
            let comma_content =
              let trimmed = String.trim content in
              if String.length trimmed > 0 && trimmed.[0] = '%' then content
              else content ^ ","
            in
            comma_content :: add_commas_except_last rest
      in
      let contents_with_commas = add_commas_except_last formatted_contents in
      ",\n" ^ String.concat "\n" contents_with_commas
  in
  header ^ contents_str ^ "\n}"

let format_bibtex_item options = function
  | Entry entry -> format_entry options entry ^ "\n"
  | Comment comment -> "%" ^ comment

let pretty_print_bibtex ?options items =
  let options = Option.value options ~default:default_options in
  String.concat "\n" (List.map (format_bibtex_item options) items)

(* Helper function to clean and normalize BibTeX entries *)
let clean_bibtex ?options input =
  let parsed_items = parse_bibtex input in
  pretty_print_bibtex ?options parsed_items