1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
type field_value =
| QuotedStringValue of string
| BracedStringValue of string
| UnquotedStringValue of string
| NumberValue of int
type field = { name : string; value : field_value }
type entry_type =
| Article
| Book
| Booklet
| Conference
| InBook
| InCollection
| InProceedings
| Manual
| MastersThesis
| Misc
| PhdThesis
| Proceedings
| TechReport
| Unpublished
type entry_content = Field of field
type bibtex_entry = {
entry_type : entry_type;
citekey : string;
contents : entry_content list;
}
type bibtex_item = Entry of bibtex_entry
type parse_error = { line : int; position : int; message : string }
type parse_result = { items : bibtex_item list; errors : parse_error list }
let string_of_entry_type = function
| Article -> "article"
| Book -> "book"
| Booklet -> "booklet"
| Conference -> "conference"
| InBook -> "inbook"
| InCollection -> "incollection"
| InProceedings -> "inproceedings"
| Manual -> "manual"
| MastersThesis -> "mastersthesis"
| Misc -> "misc"
| PhdThesis -> "phdthesis"
| Proceedings -> "proceedings"
| TechReport -> "techreport"
| Unpublished -> "unpublished"
let entry_type_of_string s =
match String.lowercase_ascii s with
| "article" -> Article
| "book" -> Book
| "booklet" -> Booklet
| "conference" -> Conference
| "inbook" -> InBook
| "incollection" -> InCollection
| "inproceedings" -> InProceedings
| "manual" -> Manual
| "mastersthesis" -> MastersThesis
| "misc" -> Misc
| "phdthesis" -> PhdThesis
| "proceedings" -> Proceedings
| "techreport" -> TechReport
| "unpublished" -> Unpublished
| _ -> Misc
let return x _input pos = Some (x, pos)
let bind p f input pos =
match p input pos with Some (x, pos') -> f x input pos' | None -> None
let ( >>= ) = bind
let ( <|> ) p1 p2 input pos =
match p1 input pos with Some result -> Some result | None -> p2 input pos
let rec many p input pos =
match p input pos with
| Some (x, pos') -> (
match many p input pos' with
| Some (xs, pos'') -> Some (x :: xs, pos'')
| None -> Some ([ x ], pos'))
| None -> Some ([], pos)
let many_with_errors p input pos =
let get_line_number pos =
let rec count_lines pos line_num =
if pos <= 0 then line_num
else if input.[pos - 1] = '\n' then count_lines (pos - 1) (line_num + 1)
else count_lines (pos - 1) line_num
in
count_lines pos 1
in
let rec aux acc_items acc_errors curr_pos =
match p input curr_pos with
| Some (x, pos') -> aux (x :: acc_items) acc_errors pos'
| None ->
let rec find_next_entry pos =
if pos >= String.length input then pos
else if input.[pos] = '@' then pos
else find_next_entry (pos + 1)
in
let next_pos = find_next_entry curr_pos in
if next_pos >= String.length input then
Some (List.rev acc_items, List.rev acc_errors, curr_pos)
else if next_pos = curr_pos then
let error =
{
line = get_line_number curr_pos;
position = curr_pos;
message =
"Failed to parse entry starting at line "
^ string_of_int (get_line_number curr_pos);
}
in
aux acc_items (error :: acc_errors) (next_pos + 1)
else
let error =
{
line = get_line_number curr_pos;
position = curr_pos;
message =
"Skipped unparsable content from line "
^ string_of_int (get_line_number curr_pos)
^ " to line "
^ string_of_int (get_line_number next_pos);
}
in
aux acc_items (error :: acc_errors) next_pos
in
aux [] [] pos
let optional p input pos =
match p input pos with
| Some (x, pos') -> Some (Some x, pos')
| None -> Some (None, pos)
let peek_char input pos =
if pos < String.length input then Some input.[pos] else None
let char c input pos =
match peek_char input pos with
| Some c' when c = c' -> Some (c, pos + 1)
| _ -> None
let satisfy pred input pos =
match peek_char input pos with
| Some c when pred c -> Some (c, pos + 1)
| _ -> None
let rec skip_while pred input pos =
match peek_char input pos with
| Some c when pred c -> skip_while pred input (pos + 1)
| _ -> pos
let skip_whitespace input pos =
skip_while
(function ' ' | '\t' | '\n' | '\r' -> true | _ -> false)
input pos
let whitespace input pos =
let pos' = skip_whitespace input pos in
Some ((), pos')
let ws p =
whitespace >>= fun _ ->
p >>= fun x ->
whitespace >>= fun _ -> return x
let take_while pred input pos =
let start = pos in
let pos' = skip_while pred input pos in
if pos' > start then Some (String.sub input start (pos' - start), pos')
else None
let take_while1 pred input pos =
match take_while pred input pos with
| Some (s, pos') when String.length s > 0 -> Some (s, pos')
| _ -> None
let is_alpha c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
let is_digit c = c >= '0' && c <= '9'
let is_alnum c = is_alpha c || is_digit c
let is_ident_char c = is_alnum c || c = '_' || c = '-' || c = ':'
let is_field_name_char c =
is_alnum c || c = '_' || c = '-' || c = ':' || c = '.' || c = '+'
let identifier =
satisfy (fun c -> is_alpha c || c = '_') >>= fun first ->
take_while is_ident_char >>= fun rest -> return (String.make 1 first ^ rest)
let field_name =
satisfy (fun c -> is_alpha c || c = '_') >>= fun first ->
take_while is_field_name_char >>= fun rest ->
return (String.make 1 first ^ rest)
let number =
take_while1 is_digit >>= fun digits -> return (int_of_string digits)
let is_utf8_continuation_byte c =
let code = Char.code c in
code >= 0x80 && code <= 0xBF
let collect_utf8_sequence input pos start_char =
let rec collect_utf8 bytes_left acc_utf8 curr_pos =
if bytes_left = 0 then (acc_utf8, curr_pos)
else
match peek_char input curr_pos with
| Some c when is_utf8_continuation_byte c ->
collect_utf8 (bytes_left - 1)
(acc_utf8 ^ String.make 1 c)
(curr_pos + 1)
| _ -> (acc_utf8, curr_pos)
in
let bytes_to_read =
let code = Char.code start_char in
if code >= 0xC0 && code <= 0xDF then 1
else if code >= 0xE0 && code <= 0xEF then 2
else if code >= 0xF0 && code <= 0xF7 then 3
else 0
in
collect_utf8 bytes_to_read (String.make 1 start_char) (pos + 1)
let quoted_string input pos =
match char '"' input pos with
| Some (_, pos1) ->
let rec loop acc pos brace_level =
match peek_char input pos with
| None -> None
| Some '"' when brace_level = 0 -> Some (acc, pos + 1)
| Some '{' -> loop (acc ^ "{") (pos + 1) (brace_level + 1)
| Some '}' -> loop (acc ^ "}") (pos + 1) (max 0 (brace_level - 1))
| Some '\\' when pos + 1 < String.length input ->
let escaped = String.sub input (pos + 1) 1 in
loop (acc ^ "\\" ^ escaped) (pos + 2) brace_level
| Some c ->
if Char.code c >= 0xC0 then
let utf8_seq, new_pos = collect_utf8_sequence input pos c in
loop (acc ^ utf8_seq) new_pos brace_level
else
loop (acc ^ String.make 1 c) (pos + 1) brace_level
in
loop "" pos1 0
| None -> None
let braced_string input pos =
match char '{' input pos with
| Some (_, pos1) ->
let rec loop acc pos brace_level =
match peek_char input pos with
| None -> None
| Some '}' when brace_level = 1 -> Some (acc, pos + 1)
| Some '{' -> loop (acc ^ "{") (pos + 1) (brace_level + 1)
| Some '}' -> loop (acc ^ "}") (pos + 1) (brace_level - 1)
| Some '\\' when pos + 1 < String.length input ->
let escaped = String.sub input (pos + 1) 1 in
loop (acc ^ "\\" ^ escaped) (pos + 2) brace_level
| Some c ->
if Char.code c >= 0xC0 then
let utf8_seq, new_pos = collect_utf8_sequence input pos c in
loop (acc ^ utf8_seq) new_pos brace_level
else
loop (acc ^ String.make 1 c) (pos + 1) brace_level
in
loop "" pos1 1
| None -> None
let is_unquoted_value_char c =
is_alnum c || c = '_' || c = '-' || c = '.' || c = '/' || c = ':' || c = '+'
|| c = '~' || c = '@' || c = '#' || c = '$' || c = '%' || c = '^' || c = '&'
|| c = '*' || c = '(' || c = ')' || c = '[' || c = ']'
let unquoted_value = take_while1 is_unquoted_value_char
let field_value =
quoted_string
>>= (fun s -> return (QuotedStringValue s))
<|> (braced_string >>= fun s -> return (BracedStringValue s))
<|> (number >>= fun n -> return (NumberValue n))
<|> (unquoted_value >>= fun s -> return (UnquotedStringValue s))
let field_entry =
ws field_name >>= fun name ->
ws (char '=') >>= fun _ ->
ws field_value >>= fun value -> return { name; value }
let input pos =
match peek_char input pos with
| Some '%' ->
let = pos + 1 in
let rec find_eol pos =
match peek_char input pos with
| None -> pos
| Some '\n' -> pos
| Some '\r' -> pos
| Some _ -> find_eol (pos + 1)
in
let eol_pos = find_eol (pos + 1) in
let =
if eol_pos > comment_start then
String.sub input comment_start (eol_pos - comment_start)
else ""
in
let next_pos =
match peek_char input eol_pos with
| None -> eol_pos
| Some '\n' -> eol_pos + 1
| Some '\r' ->
if eol_pos + 1 < String.length input && input.[eol_pos + 1] = '\n'
then eol_pos + 2
else eol_pos + 1
| Some _ -> eol_pos
in
Some (comment_text, next_pos)
| _ -> None
let input pos =
match parse_comment_line input pos with
| Some (, pos') -> Some (EntryComment comment_text, pos')
| None -> None
let entry_contents_parser input pos =
let parse_item input pos =
let pos_after_ws = skip_whitespace input pos in
match entry_comment_parser input pos_after_ws with
| Some (, pos') -> Some (comment, pos')
| None -> (
match field_entry input pos_after_ws with
| Some (field, pos') -> Some (Field field, pos')
| None -> None)
in
let rec parse_contents acc input pos =
let pos_after_ws = skip_whitespace input pos in
match peek_char input pos_after_ws with
| Some '}' ->
Some (List.rev acc, pos_after_ws)
| Some ',' ->
parse_contents acc input (pos_after_ws + 1)
| _ -> (
match parse_item input pos_after_ws with
| Some (item, pos') ->
let pos_after_ws' = skip_whitespace input pos' in
parse_contents (item :: acc) input pos_after_ws'
| None -> (
match peek_char input pos_after_ws with
| Some '}' -> Some (List.rev acc, pos_after_ws)
| _ -> None))
in
parse_contents [] input pos
let entry_type_parser =
identifier >>= fun type_str -> return (entry_type_of_string type_str)
let bibtex_entry =
ws (char '@') >>= fun _ ->
entry_type_parser >>= fun entry_type ->
ws (char '{') >>= fun _ ->
ws identifier >>= fun citekey ->
optional (ws (char ',')) >>= fun _ ->
entry_contents_parser >>= fun contents ->
ws (char '}') >>= fun _ -> return { entry_type; citekey; contents }
let =
char '%' >>= fun _ ->
take_while (fun c -> c <> '\n' && c <> '\r') >>= fun text -> return text
let bibtex_item =
bibtex_entry
>>= (fun entry -> return (Entry entry))
<|> (comment >>= fun text -> return (Comment text))
let bibtex_file =
many (ws bibtex_item) >>= fun items ->
ws (return ()) >>= fun _ -> return items
let bibtex_file_with_errors input pos =
match many_with_errors (ws bibtex_item) input pos with
| Some (items, errors, final_pos) -> (
match ws (return ()) input final_pos with
| Some (_, end_pos) -> Some ({ items; errors }, end_pos)
| None -> Some ({ items; errors }, final_pos))
| None -> Some ({ items = []; errors = [] }, pos)
let parse_with parser input =
match parser input 0 with Some (result, _) -> Some result | None -> None
let parse_bibtex input =
match parse_with bibtex_file input with Some items -> items | None -> []
let parse_bibtex_with_errors input =
match parse_with bibtex_file_with_errors input with
| Some result -> result
| None ->
{
items = [];
errors =
[
{ line = 1; position = 0; message = "Failed to parse BibTeX file" };
];
}
let has_parse_errors result = result.errors <> []
let get_parse_errors result = result.errors
let get_parsed_items result = result.items
type options = { capitalize_names : bool; strict : bool; align_entries : bool }
let default_options =
{ capitalize_names = true; strict = false; align_entries = true }
let replace_string ~pattern ~replacement text =
let pattern_len = String.length pattern in
if pattern_len = 0 then text
else
let rec find_and_replace start_pos acc =
try
let match_pos = String.index_from text start_pos pattern.[0] in
if
match_pos + pattern_len <= String.length text
&& String.sub text match_pos pattern_len = pattern
then
let prefix = String.sub text start_pos (match_pos - start_pos) in
find_and_replace (match_pos + pattern_len) (acc ^ prefix ^ replacement)
else
let next_pos = match_pos + 1 in
let prefix = String.sub text start_pos (next_pos - start_pos) in
find_and_replace next_pos (acc ^ prefix)
with Not_found ->
if start_pos < String.length text then
acc ^ String.sub text start_pos (String.length text - start_pos)
else acc
in
find_and_replace 0 ""
let escape_table =
let escapes =
List.map
(fun s -> Re.compile @@ Re.str s)
[ "%2F"; "%28"; "%29"; "%3C"; "%3E"; "%3A"; "%3B"; "%3F"; "%26" ]
in
let chars = [ "/"; "("; ")"; "<"; ">"; ":"; ";"; "?"; "&" ] in
List.combine escapes chars
let unescape_url s =
List.fold_left
(fun s (re_code, chr) -> Re.replace_string ~all:true re_code ~by:chr s)
s escape_table
let normalize_unicode s =
let replacements =
[
("\226\128\147", "-");
("\226\128\148", "--");
("\226\128\156", "\"");
("\226\128\157", "\"");
("\226\128\152", "'");
("\226\128\153", "'");
]
in
let s_ref = ref s in
List.iter
(fun (from, to_) ->
s_ref := replace_string ~pattern:from ~replacement:to_ !s_ref)
replacements;
!s_ref
let format_field_value = function
| QuotedStringValue s ->
let normalized_s = normalize_unicode s in
"\"" ^ normalized_s ^ "\""
| BracedStringValue s ->
let normalized_s = normalize_unicode s in
"{" ^ normalized_s ^ "}"
| UnquotedStringValue s ->
let normalized_s = normalize_unicode s in
"{" ^ normalized_s ^ "}"
| NumberValue n -> string_of_int n
let format_field_value_with_url_unescaping field_name field_value =
if String.lowercase_ascii field_name = "url" then
match field_value with
| QuotedStringValue s ->
let unescaped_s = unescape_url s in
let normalized_s = normalize_unicode unescaped_s in
"\"" ^ normalized_s ^ "\""
| BracedStringValue s ->
let unescaped_s = unescape_url s in
let normalized_s = normalize_unicode unescaped_s in
"{" ^ normalized_s ^ "}"
| UnquotedStringValue s ->
let unescaped_s = unescape_url s in
let normalized_s = normalize_unicode unescaped_s in
"{" ^ normalized_s ^ "}"
| NumberValue n -> string_of_int n
else format_field_value field_value
let format_entry_name capitalized name =
if capitalized then String.uppercase_ascii name else name
let format_field_with_padding capitalized field max_width =
let padding = String.make (max_width - String.length field.name) ' ' in
let name = format_entry_name capitalized field.name in
" " ^ name ^ padding ^ " = "
^ format_field_value_with_url_unescaping field.name field.value
let format_field capitalized field =
" "
^ format_entry_name capitalized field.name
^ " = "
^ format_field_value_with_url_unescaping field.name field.value
let format_entry_content_with_padding capitalized max_width = function
| Field field -> format_field_with_padding capitalized field max_width
| EntryComment -> " %" ^ comment
let format_entry_content capitalized = function
| Field field -> format_field capitalized field
| EntryComment -> " %" ^ comment
let format_entry options entry =
(if options.strict then
let field_names =
List.fold_left
(fun acc content ->
match content with
| Field field -> String.lowercase_ascii field.name :: acc
| EntryComment _ -> acc)
[] entry.contents
in
let has_duplicates =
List.length field_names
<> List.length (List.sort_uniq String.compare field_names)
in
if has_duplicates then
failwith ("Duplicate fields found in entry: " ^ entry.citekey));
let format_entry =
if options.align_entries then
let max_field_width =
List.fold_left
(fun acc content ->
match content with
| Field field -> max acc (String.length field.name)
| EntryComment _ -> acc)
0 entry.contents
in
format_entry_content_with_padding options.capitalize_names max_field_width
else format_entry_content options.capitalize_names
in
let entry_type_str = string_of_entry_type entry.entry_type in
let = "@" ^ entry_type_str ^ "{" ^ entry.citekey in
let contents_str =
if entry.contents = [] then ""
else
let formatted_contents =
entry.contents
|> List.filter (function
| Field f when String.(lowercase_ascii @@ f.name) = "month" ->
false
| _ -> true)
|> List.map format_entry
in
let rec add_commas_except_last = function
| [] -> []
| [ last ] -> [ last ]
| content :: rest ->
let comma_content =
let trimmed = String.trim content in
if String.length trimmed > 0 && trimmed.[0] = '%' then content
else content ^ ","
in
comma_content :: add_commas_except_last rest
in
let contents_with_commas = add_commas_except_last formatted_contents in
",\n" ^ String.concat "\n" contents_with_commas
in
header ^ contents_str ^ "\n}"
let format_bibtex_item options = function
| Entry entry -> format_entry options entry ^ "\n"
| Comment -> "%" ^ comment
let pretty_print_bibtex ?options items =
let options = Option.value options ~default:default_options in
String.concat "\n" (List.map (format_bibtex_item options) items)
let clean_bibtex ?options input =
let parsed_items = parse_bibtex input in
pretty_print_bibtex ?options parsed_items