% \iffalse meta-comment
%
%% File: tagpdf-backend.dtx
%
% Copyright (C) 2019-2025 Ulrike Fischer
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "tagpdf bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/tagpdf
%
% for those people who are interested.
%
%<*driver>
\DocumentMetadata{}
\documentclass{l3doc}
\usepackage{array,booktabs,caption}
\hypersetup{pdfauthor=Ulrike Fischer, pdftitle=tagpdf-mc module (tagpdf)}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
% \title{^^A
%   The \pkg{tagpdf-luatex.def} \\ Driver for luatex ^^A
%   \\ Part of the tagpdf package
% }
%
% \author{^^A
%  Ulrike Fischer\thanks
%    {^^A
%      E-mail:
%        \href{mailto:fischer@troubleshooting-tex.de}
%          {fischer@troubleshooting-tex.de}^^A
%    }^^A
% }
%
% \date{Version 0.99l, released 2025-01-12}
% \maketitle
% \begin{implementation}
%    \begin{macrocode}
%<@@=tag>
%<*luatex>
\ProvidesExplFile {tagpdf-luatex.def} {2025-01-12} {0.99l}
  {tagpdf~driver~for~luatex}
%    \end{macrocode}
% \section{Loading the lua}
% The space code requires that the fallback font has been loaded and initialized,
% so we force that first. But perhaps this could be done in the kernel.
%
%    \begin{macrocode}
{
  \fontencoding{TU}\fontfamily{lmr}\fontseries{m}\fontshape{n}\fontsize{10pt}{10pt}\selectfont
}
\lua_now:e { tagpdf=require('tagpdf.lua') }
%    \end{macrocode}
%
% The following defines wrappers around prop and seq commands to store the
% data also in lua tables.
% I probably also want lua tables.
% I put them in the ltx.@@.tables namespace.
% The tables are named like the variables but without the backslash.
% To access such a table with a dynamic name, create a string and then use
% ltx.@@.tables[string].
% Old code, I'm not quite sure if this was a good idea. Now I have a mix of tables in
% ltx.@@.tables and ltx.@@.mc/struct. And a lot is probably not needed.
% TODO: this should be cleaned up, but at least roles are currently using
% the table!
%
% Every wrapper first calls the expl3 command of the same name and then
% mirrors the change in the lua table |ltx.@@.tables.<variable name>|.
% Note that keys and sequence items are written into a lua string literal
% without escaping, only the value of |\@@_prop_gput:Nnn| is passed through
% |\lua_escape:n|.
%
% \begin{macro}
%  {
%    \@@_prop_new:N,
%    \@@_prop_new_linked:N,
%    \@@_seq_new:N,
%    \@@_prop_gput:Nnn,
%    \@@_seq_gput_right:Nn,
%    \@@_seq_gput_left:Nn,
%    \@@_seq_item:cn,
%    \@@_prop_item:cn,
%    \@@_seq_show:N,
%    \@@_prop_show:N
%  }
%    \begin{macrocode}
\cs_set_protected:Npn \@@_prop_new:N #1
  {
    \prop_new:N #1
    \lua_now:e { ltx.@@.tables.\cs_to_str:N#1 = {} }
  }

\cs_set_protected:Npn \@@_prop_new_linked:N #1
  {
    \prop_new_linked:N #1
    \lua_now:e { ltx.@@.tables.\cs_to_str:N#1 = {} }
  }

\cs_set_protected:Npn \@@_seq_new:N #1
  {
    \seq_new:N #1
    \lua_now:e { ltx.@@.tables.\cs_to_str:N#1 = {} }
  }

\cs_set_protected:Npn \@@_prop_gput:Nnn #1 #2 #3
  {
    \prop_gput:Nnn #1 { #2 } { #3 }
    \lua_now:e { ltx.@@.tables.\cs_to_str:N#1 ["#2"] = "\lua_escape:n{#3}" }
  }

\cs_set_protected:Npn \@@_seq_gput_right:Nn #1 #2
  {
    \seq_gput_right:Nn #1 { #2 }
    \lua_now:e { table.insert(ltx.@@.tables.\cs_to_str:N#1, "#2") }
  }
%    \end{macrocode}
%
% The following ``left'' variant still appends on the right of the lua table,
% so the lua copy does not have the same order as the seq. As the lua table
% is not used for kids this is ignored for now.
%    \begin{macrocode}
\cs_set_protected:Npn \@@_seq_gput_left:Nn #1 #2
  {
    \seq_gput_left:Nn #1 { #2 }
    \lua_now:e { table.insert(ltx.@@.tables.\cs_to_str:N#1, "#2") }
  }

%Hm not quite sure about the naming

\cs_set:Npn \@@_seq_item:cn #1 #2
  {
    \lua_now:e { tex.print(ltx.@@.tables.#1[#2]) }
  }

\cs_set:Npn \@@_prop_item:cn #1 #2
  {
    \lua_now:e { tex.print(ltx.@@.tables.#1["#2"]) }
  }

%for debugging commands that show both the seq/prop and the lua tables
\cs_set_protected:Npn \@@_seq_show:N #1
  {
    \seq_show:N #1
    \lua_now:e { ltx.@@.trace.log ("lua~sequence~array~\cs_to_str:N#1",1) }
    \lua_now:e { ltx.@@.trace.show_seq (ltx.@@.tables.\cs_to_str:N#1) }
  }

\cs_set_protected:Npn \@@_prop_show:N #1
  {
    \prop_show:N #1
    \lua_now:e {ltx.@@.trace.log ("lua~property~table~\cs_to_str:N#1",1) }
    \lua_now:e {ltx.@@.trace.show_prop (ltx.@@.tables.\cs_to_str:N#1) }
  }
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%</luatex>
%    \end{macrocode}
% The module declaration
%    \begin{macrocode}
%<*lua>
-- tagpdf.lua
-- Ulrike Fischer

local ProvidesLuaModule = {
    name          = "tagpdf",
    version       = "0.99l",       --TAGVERSION
    date          = "2025-01-12", --TAGDATE
    description   = "tagpdf lua code",
    license       = "The LATEX Project Public License 1.3c"
}

if luatexbase and luatexbase.provides_module then
  luatexbase.provides_module (ProvidesLuaModule)
end

--[[
The code has quite probably a number of problems
 - more variables should be local instead of global
 - the naming is not always consistent due to the development of the code
 - the traversing of the shipout box must be tested with more complicated setups
 - it should probably handle more node types
 -
--]]

%    \end{macrocode}
% Some comments about the lua structure.
%    \begin{macrocode}
--[[
the main table is named ltx.@@. It contains the functions and also the data
collected during the compilation.

ltx.@@.mc     will contain mc connected data.
ltx.@@.struct will contain structure related data.
ltx.@@.page   will contain page data
ltx.@@.tables contains also data from mc and struct (from older code).
              This needs cleaning up.
              There are certainly duplicates, but I don't dare yet ...
ltx.@@.func   will contain (public) functions.
ltx.@@.trace  will contain tracing/logging functions.
ltx.@@.conf   will contain configuration variables.
local functions starts with __
functions meant for users will be in ltx.tag

functions
 ltx.@@.func.get_num_from (tag):    takes a tag (string) and returns the id number
 ltx.@@.func.output_num_from (tag): takes a tag (string) and prints (to tex) the id number
 ltx.@@.func.get_tag_from (num):    takes a num and returns the tag
 ltx.@@.func.output_tag_from (num): takes a num and prints (to tex) the tag
 ltx.@@.func.store_mc_data (num,key,data): stores key=data in ltx.@@.mc[num]
 ltx.@@.func.store_mc_label (label,num): stores label=num in ltx.@@.mc.labels
 ltx.@@.func.store_mc_kid (mcnum,kid,page): stores the mc-kids of mcnum on page page
 ltx.@@.func.store_mc_in_page(mcnum,mcpagecnt,page): stores in the page table the number of mcnum on this page
 ltx.@@.func.store_struct_mcabs (structnum,mcnum): stores relations structnum<->mcnum (abs)
 ltx.@@.func.mc_num_of_kids (mcnum): returns the number of kids of mcnum
 ltx.@@.func.mc_insert_kids (mcnum,single): inserts the /K entries for mcnum by wandering through the [kids] table
 ltx.@@.func.mark_page_elements(box,mcpagecnt,mccntprev,mcopen,name,mctypeprev) : the main function
 ltx.@@.func.mark_shipout (): a wrapper around the core function which inserts the last EMC
 ltx.@@.func.fill_parent_tree_line (page): outputs the entries of the parenttree for this page
 ltx.@@.func.output_parenttree(): outputs the content of the parenttree
 ltx.@@.func.pdf_object_ref(name,index): returns the object reference for the object name
 ltx.@@.func.markspaceon(), ltx.@@.func.markspaceoff(): (de)activates the marking of positions for space chars
 ltx.@@.trace.show_mc_data (num,loglevel): shows ltx.@@.mc[num] if the current log level is >= loglevel
 ltx.@@.trace.show_all_mc_data (min,max,loglevel): shows the mc's from min to max if the current log level is >= loglevel
 ltx.@@.trace.show_seq: shows a sequence (array)
 ltx.@@.trace.show_struct_data (num): shows data of structure num
 ltx.@@.trace.show_prop: shows a prop
 ltx.@@.trace.log
 ltx.@@.trace.showspaces : boolean
 ltx.tag.get_struct_num: returns the current structure number
 ltx.tag.get_struct_counter: returns the value of the structure counter
 ltx.tag.get_struct_num_next: returns the next structure number
--]]

%    \end{macrocode}
% This sets up the main attribute registers.
% The mc_type attribute stores the type (P, Span etc) encoded as a num,
% The mc_cnt attribute stores the absolute number and so allows one to see
% if a node belongs to the same mc-chunk.
%
% The interwordspace attr is set by the function |@@_mark_spaces|, and marks
% the place where spaces should be inserted.
% The interwordfont attr is set by the function |@@_mark_spaces| too and
% stores the font, so that we can decide which font
% to use for the real space char.
% The interwordspaceOff attr allows to locally suppress the insertion of
% real space chars, e.g. when they are inserted by other means (e.g. with |\char|).
%    \begin{macrocode}
local mctypeattributeid       = luatexbase.new_attribute ("g_@@_mc_type_attr")
local mccntattributeid        = luatexbase.new_attribute ("g_@@_mc_cnt_attr")
local iwspaceOffattributeid   = luatexbase.new_attribute ("g__tag_interwordspaceOff_attr")
local iwspaceattributeid      = luatexbase.new_attribute ("g_@@_interwordspace_attr")
local iwfontattributeid       = luatexbase.new_attribute ("g_@@_interwordfont_attr")
%    \end{macrocode}
% with this token we can query the state of the boolean
% and so detect if unmarked nodes should be marked as artifacts
%    \begin{macrocode}
local tagunmarkedbool= token.create("g_@@_tagunmarked_bool")
local truebool       = token.create("c_true_bool")
%    \end{macrocode}
% with this token we can query the state of the softhyphen boolean
% and so detect if hyphens from hyphenation should be replaced by soft-hyphens.
%    \begin{macrocode}
local softhyphenbool = token.create("g_@@_softhyphen_bool")
%    \end{macrocode}
% Now a number of local versions from global tables.
% Not all is perhaps needed, most node variants were copied from lua-debug.
%    \begin{macrocode}
-- catcode table used when printing back to TeX
local catlatex       = luatexbase.registernumber("catcodetable@latex")
-- local aliases of library functions
local tableinsert    = table.insert
local nodeid           = node.id
local nodecopy         = node.copy
local nodegetattribute = node.get_attribute
local nodesetattribute = node.set_attribute
local nodehasattribute = node.has_attribute
local nodenew          = node.new
local nodetail         = node.tail
local nodeslide        = node.slide
local noderemove       = node.remove
local nodetraverseid   = node.traverse_id
local nodetraverse     = node.traverse
local nodeinsertafter  = node.insert_after
local nodeinsertbefore = node.insert_before
local pdfpageref       = pdf.pageref

local fonthashes       = fonts.hashes
local identifiers      = fonthashes.identifiers
local fontid           = font.id

-- numeric ids of the node types which are tested in the traversing code
local HLIST          = node.id("hlist")
local VLIST          = node.id("vlist")
local RULE           = node.id("rule")
local DISC           = node.id("disc")
local GLUE           = node.id("glue")
local GLYPH          = node.id("glyph")
local KERN           = node.id("kern")
local PENALTY        = node.id("penalty")
local LOCAL_PAR      = node.id("local_par")
local MATH           = node.id("math")

-- subtype numbers of disc nodes
local explicit_disc = 1
local regular_disc = 3
%    \end{macrocode}
% Now we set up the main table structure. ltx is used by other latex code too,
% so every table is only created if it doesn't exist yet.
%    \begin{macrocode}
ltx             = ltx        or { }
ltx.tag         = ltx.tag    or { } -- user commands
ltx.@@          = ltx.@@         or { }
ltx.@@.mc       = ltx.@@.mc      or  { } -- mc data
ltx.@@.struct   = ltx.@@.struct  or  { } -- struct data
ltx.@@.tables   = ltx.@@.tables  or  { } -- tables created with new prop and new seq.
                                         -- wasn't a so great idea ...
                                         -- g_@@_role_tags_seq used by tag<-> is in this tables!
                                         -- used for pure lua tables too now!
ltx.@@.page     = ltx.@@.page    or  { } -- page data, currently only i->{0->mcnum,1->mcnum,...}
ltx.@@.trace    = ltx.@@.trace   or  { } -- show commands
ltx.@@.func     = ltx.@@.func    or  { } -- functions
ltx.@@.conf     = ltx.@@.conf    or  { } -- configuration variables
%    \end{macrocode}
%
% \section{User commands to access data}
% Code like the one in luamml will have to access the current state in some places.
% \begin{macro}
%  {
%    ltx.tag.get_struct_num,
%    ltx.tag.get_struct_counter,
%    ltx.tag.get_struct_num_next
%  }
% The first function returns the content of the macro
% |g__tag_struct_stack_current_tl|, the second the value of the count
% |c@g_@@_struct_abs_int| and the third this value plus one.
%    \begin{macrocode}
local @@_get_struct_num = function()
  -- the parentheses restrict the result to a single value
  return (token.get_macro("g__tag_struct_stack_current_tl"))
end

local @@_get_struct_counter = function()
  return (tex.getcount("c@g_@@_struct_abs_int"))
end

local @@_get_struct_num_next = function()
  local current = tex.getcount("c@g_@@_struct_abs_int")
  return current + 1
end

ltx.tag.get_struct_num = @@_get_struct_num
ltx.tag.get_struct_counter = @@_get_struct_counter
ltx.tag.get_struct_num_next = @@_get_struct_num_next
%    \end{macrocode}
% \end{macro}
% \section{Logging functions}
%
% \begin{macro}{@@_log,ltx.@@.trace.log}
% This rather simple log function takes as argument a
% message (string) and a number and
% will output the message to the log/terminal if the current loglevel
% is greater than or equal to the number. If no number is given, 3 is used.
%    \begin{macrocode}
local @@_log = function (message,loglevel)
  local threshold = loglevel or 3
  if tex.count["l_@@_loglevel_int"] >= threshold then
    texio.write_nl("tagpdf: ".. message)
  end
end

ltx.@@.trace.log = @@_log
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.trace.show_seq}
% This shows the content of a seq as stored in the tables table.
% It is used by the |\@@_seq_show:N| function. It is not used
% in user commands, only for debugging, and so requires log level >0.
%    \begin{macrocode}
function ltx.@@.trace.show_seq (seq)
  -- anything which is not a table is reported as missing
  if type(seq) ~= "table" then
    @@_log ("sequence " .. tostring(seq) .. " not found",1)
    return
  end
  for index,item in ipairs(seq) do
    @@_log ("[" .. index .. "] => " .. tostring(item),1)
  end
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ @@_pairs_prop,ltx.@@.trace.show_prop}
% This shows the content of a prop as stored in the tables table.
% It is used by the |\@@_prop_show:N| function.
%    \begin{macrocode}
-- returns an iterator which walks through prop in the order of the sorted keys
local @@_pairs_prop = function (prop)
  local keys = {}
  for key in pairs(prop) do
    keys[#keys + 1] = key
  end
  table.sort(keys)
  local pos = 0
  return function ()
    pos = pos + 1
    local key = keys[pos]
    if key == nil then
      return nil
    end
    return key, prop[key]
  end
end

function ltx.@@.trace.show_prop (prop)
  if type(prop) ~= "table" then
    @@_log ("prop " .. tostring(prop) .. " not found or not a table",1)
    return
  end
  for key,value in @@_pairs_prop (prop) do
    @@_log ("[" .. key .. "] => " .. tostring(value),1)
  end
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.trace.show_mc_data}
% This shows some data for a mc given by |num|.
% Whether something is shown depends on the log level.
% The function is used by the following function and then in
% |\ShowTagging|
%    \begin{macrocode}
function ltx.@@.trace.show_mc_data (num,loglevel)
  local mcdata = ltx.@@ and ltx.@@.mc and ltx.@@.mc[num]
  if not mcdata then
    @@_log ("mc"..num.." not found",loglevel)
    return
  end
  -- all entries of the mc
  for k,v in pairs(mcdata) do
    @@_log ("mc"..num..": "..tostring(k).."=>"..tostring(v),loglevel)
  end
  -- and the kids, if there are some
  local kids = mcdata["kids"]
  if kids then
    @@_log ("mc" .. num .. " has " .. #kids .. " kids",loglevel)
    for k,v in ipairs(kids) do
      @@_log ("mc ".. num .. " kid "..k.." =>" .. v.kid.." on page " ..v.page,loglevel)
    end
  end
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.trace.show_all_mc_data}
% This shows data for the mc's between |min| and |max| (numbers).
% It is used by the |\ShowTagging| function.
%    \begin{macrocode}
function ltx.@@.trace.show_all_mc_data (min,max,loglevel)
  for mcnum = min, max do
    ltx.@@.trace.show_mc_data (mcnum,loglevel)
  end
  -- finish with a new line
  texio.write_nl("")
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    ltx.@@.trace.show_struct_data
%  }
% This function shows some struct data.
% Unused but kept for debugging.
%    \begin{macrocode}
function ltx.@@.trace.show_struct_data (num)
 if ltx.@@ and ltx.@@.struct and ltx.@@.struct[num] then
  -- the struct tables use string keys (e.g. "mc"), so they must be
  -- traversed with pairs, ipairs would show nothing.
  for k,v in pairs(ltx.@@.struct[num]) do
   @@_log ("struct "..num..": "..tostring(k).."=>"..tostring(v),1)
  end
 else
  @@_log ("struct "..num.." not found ",1)
 end
end
%    \end{macrocode}
% \end{macro}
%
% \section{Helper functions}
% \subsection{Retrieve data functions}
% \begin{macro}{@@_get_mc_cnt_type_tag}
% This takes a node as argument and returns the mc-cnt, the mc-type
% and the tag (calculated from the mc-type). A missing attribute is
% returned as -1.
%    \begin{macrocode}
local @@_get_mc_cnt_type_tag = function (n)
  local mccnt  = nodegetattribute(n,mccntattributeid)  or -1
  local mctype = nodegetattribute(n,mctypeattributeid)  or -1
  local tag    = ltx.@@.func.get_tag_from(mctype)
  return mccnt,mctype,tag
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{@@_get_mathsubtype}
% This function allows to detect if we are at the begin or the end of math.
% It takes as argument a mathnode and returns the string |beginmath|
% if the subtype is 0 and |endmath| otherwise.
%    \begin{macrocode}
local function @@_get_mathsubtype  (mathnode)
 -- local, so that no global variable subtype is created
 local subtype
 if mathnode.subtype == 0 then
  subtype = "beginmath"
 else
  subtype = "endmath"
 end
 return subtype
end
%    \end{macrocode}
% \end{macro}
%
% \begin{variable}{ltx.@@.tables.role_tag_attribute,ltx.@@.tables.role_attribute_tag}
% The first is a table with key a tag and value a number (the attribute)
% The second is an array with the attribute value as key.
%    \begin{macrocode}
ltx.@@.tables.role_tag_attribute = {}
ltx.@@.tables.role_attribute_tag = {}
%    \end{macrocode}
% \end{variable}
% \begin{macro}{ltx.@@.func.alloctag}
% This registers a new tag: it is appended to the array and its
% position is stored as attribute value. Known tags are ignored.
%    \begin{macrocode}
local @@_alloctag =
 function (tag)
   if not ltx.@@.tables.role_tag_attribute[tag] then
    table.insert(ltx.@@.tables.role_attribute_tag,tag)
    ltx.@@.tables.role_tag_attribute[tag]=#ltx.@@.tables.role_attribute_tag
    @@_log ("Add "..tag.." "..ltx.@@.tables.role_tag_attribute[tag],3)
   end
 end
ltx.@@.func.alloctag = @@_alloctag
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    @@_get_num_from,
%    ltx.@@.func.get_num_from,
%    ltx.@@.func.output_num_from
%  }
% These functions take as argument a string |tag|, and return the number
% under which it is recorded (and so the attribute value), or -1 if the
% tag is unknown.
% The first function outputs the number for lua, while the |output| function
% outputs to tex.
%    \begin{macrocode}
local @@_get_num_from = function (tag)
  -- -1 marks an unknown tag
  return ltx.@@.tables.role_tag_attribute[tag] or -1
end

ltx.@@.func.get_num_from = @@_get_num_from

function ltx.@@.func.output_num_from (tag)
  local num = @@_get_num_from (tag)
  tex.sprint(catlatex,num)
  if num == -1 then
   @@_log ("Unknown tag "..tag.." used")
  end
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%  {
%    @@_get_tag_from ,
%    ltx.@@.func.get_tag_from,
%    ltx.@@.func.output_tag_from
%  }
% These functions are the opposites to the previous function:
% they take as argument a number (the attribute value) and return the string |tag|,
% or |UNKNOWN| if no tag is recorded for the number.
% The first function outputs the string for lua, while the |output| function
% outputs to tex.
%    \begin{macrocode}
local @@_get_tag_from = function  (num)
  -- "UNKNOWN" is used for numbers without tag (e.g. -1)
  return ltx.@@.tables.role_attribute_tag[num] or "UNKNOWN"
end

ltx.@@.func.get_tag_from = @@_get_tag_from

function ltx.@@.func.output_tag_from (num)
  tex.sprint(catlatex,@@_get_tag_from (num))
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    ltx.@@.func.store_mc_data
%  }
% This function stores |key|=|data| for mc-chunk |num|. It is used in the
% tagpdf-mc code, to store for example the tag string, and the raw options.
%    \begin{macrocode}
function ltx.@@.func.store_mc_data (num,key,data)
 ltx.@@.mc[num] = ltx.@@.mc[num] or { }
 ltx.@@.mc[num][key] = data
 @@_log  ("INFO TEX-STORE-MC-DATA: "..num.." => "..tostring(key).." => "..tostring(data),3)
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%  {
%    ltx.@@.func.store_mc_label
%  }
% This function stores the |label|=|num| relationship in the |labels| subtable.
% TODO: this is probably unused and can go.
%    \begin{macrocode}
function ltx.@@.func.store_mc_label (label,num)
 ltx.@@.mc["labels"] = ltx.@@.mc["labels"] or { }
 ltx.@@.mc.labels[label] = num
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    ltx.@@.func.store_mc_kid
%  }
% This function is used in the traversing code. It stores
% a sub-chunk of a mc |mcnum| into the |kids| table.
% The table of the mc |mcnum| must already exist.
%    \begin{macrocode}
function ltx.@@.func.store_mc_kid (mcnum,kid,page)
 ltx.@@.trace.log("INFO TAG-STORE-MC-KID: "..mcnum.." => " .. kid.." on page " .. page,3)
 ltx.@@.mc[mcnum]["kids"] = ltx.@@.mc[mcnum]["kids"] or { }
 local kidtable = {kid=kid,page=page}
 tableinsert(ltx.@@.mc[mcnum]["kids"], kidtable )
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    ltx.@@.func.mc_num_of_kids
%  }
% This function returns the number of kids a mc |mcnum| has. We need to account for
% the case that a mc can have no kids.
%    \begin{macrocode}
function ltx.@@.func.mc_num_of_kids (mcnum)
 local num = 0
 if ltx.@@.mc[mcnum] and ltx.@@.mc[mcnum]["kids"] then
   num = #ltx.@@.mc[mcnum]["kids"]
 end
 ltx.@@.trace.log ("INFO MC-KID-NUMBERS: " .. mcnum .. " has " .. num .. " KIDS",4)
 return num
end
%    \end{macrocode}
% \end{macro}
% \subsection{Functions to insert the pdf literals}
% \begin{macro}{@@_backend_create_emc_node,@@_insert_emc_node}
% This inserts the emc node.
% We support also the dvips and the dvipdfmx backend. The backend is
% checked only once, when the file is loaded.
%    \begin{macrocode}
local @@_backend_create_emc_node
if tex.outputmode ~= 0 then
  -- pdf mode
  @@_backend_create_emc_node = function ()
    local literal = nodenew("whatsit","pdf_literal")
    literal.data = "EMC"
    literal.mode = 1
    return literal
  end
elseif token.get_macro("c_sys_backend_str") == "dvipdfmx" then
  @@_backend_create_emc_node = function ()
    local special = nodenew("whatsit","special")
    special.data = "pdf:code EMC"
    return special
  end
else
  -- assume a dvips variant
  @@_backend_create_emc_node = function ()
    local special = nodenew("whatsit","special")
    special.data = "ps:SDict begin mark /EMC pdfmark end"
    return special
  end
end

-- inserts a new emc node before current and returns the (new) head
local function @@_insert_emc_node (head,current)
  local newhead = node.insert_before(head,current,@@_backend_create_emc_node())
  return newhead
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{@@_backend_create_bmc_node,@@_insert_bmc_node}
% This inserts a simple bmc node
%    \begin{macrocode}
local @@_backend_create_bmc_node
if tex.outputmode ~= 0 then
  -- pdf mode
  @@_backend_create_bmc_node = function (tag)
    local literal = nodenew("whatsit","pdf_literal")
    literal.data = "/"..tag.." BMC"
    literal.mode = 1
    return literal
  end
elseif token.get_macro("c_sys_backend_str") == "dvipdfmx" then
  @@_backend_create_bmc_node = function (tag)
    local special = nodenew("whatsit","special")
    special.data = "pdf:code /"..tag.." BMC"
    return special
  end
else
  -- assume a dvips variant
  @@_backend_create_bmc_node = function (tag)
    local special = nodenew("whatsit","special")
    special.data = "ps:SDict begin mark/"..tag.." /BMC pdfmark end"
    return special
  end
end

-- inserts a new bmc node for tag before current and returns the (new) head
local function @@_insert_bmc_node (head,current,tag)
  local newhead = node.insert_before(head,current,@@_backend_create_bmc_node (tag))
  return newhead
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{@@_backend_create_bdc_node,@@_insert_bdc_node}
% This inserts a bdc node with a fixed dict.
% TODO: check if this is still used, now that we create properties.
%    \begin{macrocode}
local @@_backend_create_bdc_node

if tex.outputmode == 0 then
  if token.get_macro("c_sys_backend_str") == "dvipdfmx" then
    function @@_backend_create_bdc_node (tag,dict)
      local bdcnode = nodenew("whatsit","special")
      bdcnode.data = "pdf:code /"..tag.."<<"..dict..">> BDC"
      return bdcnode
    end
  else -- assume a dvips variant
    function @@_backend_create_bdc_node (tag,dict)
      local bdcnode = nodenew("whatsit","special")
      bdcnode.data = "ps:SDict begin mark/"..tag.."<<"..dict..">> /BDC pdfmark end"
      return bdcnode
    end
  end
else -- pdf mode
  function @@_backend_create_bdc_node (tag,dict)
    local bdcnode = nodenew("whatsit","pdf_literal")
    bdcnode.data = "/"..tag.."<<"..dict..">> BDC"
    bdcnode.mode=1
    return bdcnode
  end
end

-- inserts a new bdc node before current and returns the (new) head
local function @@_insert_bdc_node (head,current,tag,dict)
 -- local, so that the node is not leaked into a global variable
 local bdcnode = @@_backend_create_bdc_node (tag,dict)
 head = node.insert_before(head,current,bdcnode)
 return head
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{@@_pdf_object_ref}
% This allows to reference a pdf object reserved with the l3pdf command by name.
% The return value is |n 0 R|, if the object doesn't exist, n is 0.
% If |ltx.pdf.object_id| is not available the object number is taken
% from the constant |c__pdf_object_<name>/<index>_int|.
%    \begin{macrocode}
local function @@_pdf_object_ref (name,index)
   local object
   -- the table ltx.pdf is tested too, so that the fallback is also
   -- used if the table doesn't exist at all.
   if ltx.pdf and ltx.pdf.object_id then
     object = ltx.pdf.object_id (name,index) ..' 0 R'
   else
     local tokenname = 'c__pdf_object_'..name..'/'..index..'_int'
     object = token.create(tokenname).mode ..' 0 R'
   end
   return object
end
ltx.@@.func.pdf_object_ref = @@_pdf_object_ref
%    \end{macrocode}
% \end{macro}
%
% \section{Function for the real space chars}
% \begin{macro}{@@_show_spacemark}
% A debugging function, it is used to
% insert red color markers in the places where space chars can go, it can have
% side effects so it is not always reliable, but ok.
%    \begin{macrocode}
local function @@_show_spacemark (head,current,color,height)
 local markcolor = color or "1 0 0"
 local markheight = height or 10
 -- dvi mode is ignored for now: nothing is inserted there,
 -- but the head is returned in both modes.
 if tex.outputmode ~= 0 then
  local pdfstring = node.new("whatsit","pdf_literal")
  pdfstring.data =
   string.format("q "..markcolor.." RG "..markcolor.." rg 0.4 w 0 %g m 0 %g l S Q",-3,markheight)
  head = node.insert_after(head,current,pdfstring)
 end
 return head
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{@@_fakespace,ltx.@@.func.fakespace}
% This is used to define a lua version of |\pdffakespace|
%    \begin{macrocode}
local function @@_fakespace()
   tex.setattribute(iwspaceattributeid,1)
   tex.setattribute(iwfontattributeid,font.current())
end
ltx.@@.func.fakespace = @@_fakespace
%    \end{macrocode}
% \end{macro}
% \begin{macro}{@@_mark_spaces}
% a function to mark up places where real space chars should be inserted.
% It only sets attributes, these are then used in a later traversing
% which inserts the actual spaces.
% When space handling is activated this function is inserted in some callbacks.
%    \begin{macrocode}
--[[ a function to mark up places where real space chars should be inserted
     it only sets an attribute.
--]]

local function @@_mark_spaces (head)
  local inside_math = false
  for n in nodetraverse(head) do
    local id = n.id
    if id == GLYPH then
      local glyph = n
      -- NOTE(review): this writes the font of the last seen glyph into a
      -- *global* variable default_currfontid; no local of this name is in
      -- scope here. The value is read again in the PENALTY branch below
      -- and is nil until a first glyph has been seen.
      default_currfontid = glyph.font
      if glyph.next and (glyph.next.id == GLUE)
        and not inside_math  and (glyph.next.width >0)
      then
        nodesetattribute(glyph.next,iwspaceattributeid,1)
        nodesetattribute(glyph.next,iwfontattributeid,glyph.font)
      -- for debugging
        if ltx.@@.trace.showspaces then
         @@_show_spacemark (head,glyph)
        end
      elseif glyph.next and (glyph.next.id==KERN) and not inside_math then
       -- glyph, kern, glue: mark the glue after the kern
       local kern = glyph.next
       if kern.next and (kern.next.id== GLUE)  and (kern.next.width >0)
       then
        nodesetattribute(kern.next,iwspaceattributeid,1)
        nodesetattribute(kern.next,iwfontattributeid,glyph.font)
       end
      end
      -- look also back
      if glyph.prev and (glyph.prev.id == GLUE)
         and not inside_math
         and (glyph.prev.width >0)
         and not nodehasattribute(glyph.prev,iwspaceattributeid)
      then
        nodesetattribute(glyph.prev,iwspaceattributeid,1)
        nodesetattribute(glyph.prev,iwfontattributeid,glyph.font)
      -- for debugging
        if ltx.@@.trace.showspaces then
         @@_show_spacemark (head,glyph)
        end
      end
    elseif id == PENALTY then
      local glyph = n
      -- ltx.@@.trace.log ("PENALTY ".. n.subtype.."VALUE"..n.penalty,3)
      if glyph.next and (glyph.next.id == GLUE)
        and not inside_math  and (glyph.next.width >0) and n.subtype==0
      then
        nodesetattribute(glyph.next,iwspaceattributeid,1)
        -- changed 2024-01-18, issue #72
        nodesetattribute(glyph.next,iwfontattributeid,default_currfontid)
      -- for debugging
        if ltx.@@.trace.showspaces then
         @@_show_spacemark (head,glyph)
        end
      end
    elseif id == MATH then
      -- subtype 0 is the begin of math
      inside_math = (n.subtype == 0)
    end
  end
  return head
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}
%  {
%    @@_activate_mark_space,
%    ltx.@@.func.markspaceon,
%    @@_deactivate_mark_space,
%    ltx.@@.func.markspaceoff
%  }
% These functions add/remove the function which marks the spaces to the callbacks
% |pre_linebreak_filter| and |hpack_filter|. Only the state of the
% |pre_linebreak_filter| callback is tested.
%    \begin{macrocode}
local function @@_activate_mark_space ()
 if not luatexbase.in_callback ("pre_linebreak_filter","markspaces") then
  luatexbase.add_to_callback("pre_linebreak_filter",@@_mark_spaces,"markspaces")
  luatexbase.add_to_callback("hpack_filter",@@_mark_spaces,"markspaces")
 end
end

ltx.@@.func.markspaceon=@@_activate_mark_space

local function @@_deactivate_mark_space ()
 if luatexbase.in_callback ("pre_linebreak_filter","markspaces") then
 luatexbase.remove_from_callback("pre_linebreak_filter","markspaces")
 luatexbase.remove_from_callback("hpack_filter","markspaces")
 end
end

ltx.@@.func.markspaceoff=@@_deactivate_mark_space
%    \end{macrocode}
% \end{macro}
% We need two local variables to set up a default space char.
%    \begin{macrocode}
local default_space_char = nodenew(GLYPH)
local default_fontid     = fontid("TU/lmr/m/n/10")
-- NOTE(review): this local is declared after @@_mark_spaces, so the
-- assignments there do not see it -- confirm which variable is intended.
local default_currfontid = fontid("TU/lmr/m/n/10")
default_space_char.char  = 32
default_space_char.font  = default_fontid
%    \end{macrocode}
% And a function to check as best as possible if a font has a space:
%    \begin{macrocode}
local function @@_font_has_space (fontid)
 -- first ask luaotfload for a glyph with the name "space"
 if luaotfload.aux.slot_of_name(fontid,"space") then
  return true
 end
 -- then look into the font data. t is local and can be nil
 -- if the font id is unknown.
 local t = fonts.hashes.identifiers[fontid]
 if t and t.characters and t.characters[32] and t.characters[32]["unicode"]==32
 then
  return true
 end
 return false
end
%    \end{macrocode}
% \begin{macro}
%  {
%    @@_space_chars_shipout,
%    ltx.@@.func.space_chars_shipout,
%  }
% This is the main function to insert real space chars. It inserts a
% glyph before every glue which has been marked previously. The attributes
% are copied from the glue, so if the tagging is done later,
% it will be tagged like it.
%    \begin{macrocode}
local function @@_space_chars_shipout (box)
 local head = box.head
  if head then
    for n in node.traverse(head) do
      local spaceattr = -1
      -- the Off attribute suppresses the insertion locally
      if not nodehasattribute(n,iwspaceOffattributeid) then
        spaceattr = nodegetattribute(n,iwspaceattributeid)  or -1
      end
      if n.id == HLIST  then -- enter the hlist
         @@_space_chars_shipout (n)
      elseif n.id == VLIST then -- enter the vlist
         @@_space_chars_shipout (n)
      elseif n.id == GLUE then
        if ltx.@@.trace.showspaces and spaceattr==1  then
          @@_show_spacemark (head,n,"0 1 0")
        end
        if spaceattr==1  then
          local space
          local space_char = node.copy(default_space_char)
          local curfont    = nodegetattribute(n,iwfontattributeid)
          ltx.@@.trace.log ("INFO SPACE-FUNCTION-FONT: ".. tostring(curfont),3)
          -- use the stored font only if it has a space char,
          -- otherwise the font of the default space char is kept
          if curfont and
             -- luaotfload.aux.slot_of_name(curfont,"space")
             @@_font_has_space (curfont)
          then
            space_char.font=curfont
          end
          head, space = node.insert_before(head, n, space_char) --
          n.width     = n.width - space.width
          space.attr  = n.attr
        end
      end
    end
    box.head = head
  end
end

function ltx.@@.func.space_chars_shipout (box)
  @@_space_chars_shipout (box)
end
%    \end{macrocode}
% \end{macro}
%
% \section{Function for the tagging}
% \begin{macro}
%  {
%    ltx.@@.func.mc_insert_kids
%  }
% This is the main function to insert the
% K entry into a StructElem object. It is used in tagpdf-mc-luacode module.
% The |single| attribute allows to handle the case that a single
% mc on the tex side can have more than one kid after the processing here,
% and so we get the correct array/non array setup.
% Every kid is written as a marked-content reference dictionary
% with the page object and the MCID.
%    \begin{macrocode}
function ltx.@@.func.mc_insert_kids (mcnum,single)
  if ltx.@@.mc[mcnum] then
  ltx.@@.trace.log("INFO TEX-MC-INSERT-KID-TEST: " .. mcnum,4)
   if ltx.@@.mc[mcnum]["kids"] then
    -- more than one kid for a single mc must be wrapped into an array
    if #ltx.@@.mc[mcnum]["kids"] > 1 and single==1 then
     tex.sprint("[")
    end
    for i,kidstable in ipairs( ltx.@@.mc[mcnum]["kids"] ) do
     local kidnum  = kidstable["kid"]
     local kidpage = kidstable["page"]
     local kidpageobjnum = pdfpageref(kidpage)
     ltx.@@.trace.log("INFO TEX-MC-INSERT-KID: " .. mcnum ..
                      " insert KID " ..i..
                      " with num " .. kidnum ..
                      " on page " .. kidpage.."/"..kidpageobjnum,3)
     tex.sprint(catlatex,"<</Type /MCR /Pg "..kidpageobjnum .. " 0 R /MCID "..kidnum.. ">> " )
    end
    if #ltx.@@.mc[mcnum]["kids"] > 1 and single==1 then
     tex.sprint("]")
    end
   else
    -- this is typically not a problem, e.g. empty hbox in footer/header can
    -- trigger this warning.
    ltx.@@.trace.log("WARN TEX-MC-INSERT-NO-KIDS: "..mcnum.." has no kids",2)
    if single==1 then
      tex.sprint("null")
    end
   end
  else
   ltx.@@.trace.log("WARN TEX-MC-INSERT-MISSING: "..mcnum.." doesn't exist",0)
  end
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.func.store_struct_mcabs}
% This function is used in the tagpdf-mc-luacode. It stores the absolute count
% of the mc into the current structure.
% This must be done ordered.
%    \begin{macrocode}
function ltx.@@.func.store_struct_mcabs (structnum,mcnum)
 local structdata = ltx.@@.struct[structnum] or { }
 ltx.@@.struct[structnum] = structdata
 -- a structure can contain more than one mc chunk, the content should be ordered
 structdata["mc"] = structdata["mc"] or { }
 tableinsert(structdata["mc"],mcnum)
 ltx.@@.trace.log("INFO TEX-MC-INTO-STRUCT: "..
                   mcnum.." inserted in struct "..structnum,3)
 -- but every mc can only be in one structure
 local mcdata = ltx.@@.mc[mcnum] or { }
 ltx.@@.mc[mcnum] = mcdata
 mcdata["parent"] = structnum
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.func.store_mc_in_page}
% This is used in the traversing code and stores the relation between
% abs count and page count.
%    \begin{macrocode}
-- pay attention: lua counts arrays from 1, tex pages from one
-- mcid and arrays in pdf count from 0.
function ltx.@@.func.store_mc_in_page (mcnum,mcpagecnt,page)
 local pagedata = ltx.@@.page[page] or {}
 ltx.@@.page[page] = pagedata
 pagedata[mcpagecnt] = mcnum
 ltx.@@.trace.log("INFO TAG-MC-INTO-PAGE: page " .. page ..
                   ": inserting MCID " .. mcpagecnt .. " => " .. mcnum,3)
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{ltx.@@.func.update_mc_attributes}
% This updates the mc-attributes of a box. It should only be used on boxes
% which don't contain structure elements.
% The arguments are the head of a node list, the mc-num and the type (as a number).
% Nested hlists and vlists are processed too. The head is returned.
%    \begin{macrocode}
local function @@_update_mc_attributes (head,mcnum,type)
  for n in node.traverse(head) do
    node.set_attribute(n,mccntattributeid,mcnum)
    node.set_attribute(n,mctypeattributeid,type)
    local id = n.id
    if id == HLIST or id == VLIST then
      -- descend into the box
      @@_update_mc_attributes (n.list,mcnum,type)
    end
  end
  return head
end
ltx.@@.func.update_mc_attributes = @@_update_mc_attributes
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}{ltx.@@.func.mark_page_elements}
% This is the main traversing function. See the lua comment for more details.
%    \begin{macrocode}
--[[
    Now follows the core function
    It wades through the shipout box and checks the attributes
    ARGUMENTS
    box: is a box,
    mcpagecnt: num, the current page cnt of mc (should start at -1 in shipout box), needed for recursion
    mccntprev: num, the attribute cnt of the previous node/whatever - if different we have a chunk border
    mcopen: num, records if some bdc/emc is open
    These arguments are only needed for log messages, if not present they are replaced by fixed defaults:
    name: string to describe the box (default "SOMEBOX")
    mctypeprev: num, the type attribute of the previous node/whatever (default -1)
    RETURNS
    mcopen, mcpagecnt, mccntprev, mctypeprev: the updated state, so that the
    caller (the recursion or the shipout function) can continue with it.
    mctypeprev is only handed through, it is not changed here.

    there are lots of logging messages currently. Should be cleaned up in due course.
    One should also find ways to make the function shorter.
--]]
function ltx.@@.func.mark_page_elements (box,mcpagecnt,mccntprev,mcopen,name,mctypeprev)
  name = name or "SOMEBOX"
  mctypeprev = mctypeprev or -1
  local abspage = status.total_pages + 1  -- the real counter is increased
                                          -- inside the box so one off
                                          -- if the callback is not used. (???)
  ltx.@@.trace.log ("INFO TAG-ABSPAGE: " .. abspage,3)
  ltx.@@.trace.log ("INFO TAG-ARGS: pagecnt".. mcpagecnt..
                    " prev "..mccntprev ..
                    " type prev "..mctypeprev,4)
  ltx.@@.trace.log ("INFO TAG-TRAVERSING-BOX: ".. tostring(name)..
                    " TYPE ".. node.type(node.getid(box)),3)
  local head = box.head -- ShipoutBox is a vlist?
  if head then
    -- only needed for the log message, so keep them local
    local mccnthead, mctypehead, taghead = @@_get_mc_cnt_type_tag (head)
    ltx.@@.trace.log ("INFO TAG-HEAD: " ..
                      node.type(node.getid(head))..
                      " MC"..tostring(mccnthead)..
                      " => TAG " .. tostring(mctypehead)..
                      " => ".. tostring(taghead),3)
  else
    ltx.@@.trace.log ("INFO TAG-NO-HEAD: head is "..
                      tostring(head),3)
  end
  for n in node.traverse(head) do
    local mccnt, mctype, tag = @@_get_mc_cnt_type_tag (n)
    ltx.@@.trace.log ("INFO TAG-NODE: "..
                      node.type(node.getid(n))..
                      " MC".. tostring(mccnt)..
                      " => TAG ".. tostring(mctype)..
                      " => " .. tostring(tag),3)
    if n.id == HLIST then
      -- enter the hlist
      mcopen,mcpagecnt,mccntprev,mctypeprev=
       ltx.@@.func.mark_page_elements (n,mcpagecnt,mccntprev,mcopen,"INTERNAL HLIST",mctypeprev)
    elseif n.id == VLIST then
      -- enter the vlist
      mcopen,mcpagecnt,mccntprev,mctypeprev=
       ltx.@@.func.mark_page_elements (n,mcpagecnt,mccntprev,mcopen,"INTERNAL VLIST",mctypeprev)
    elseif n.id == GLUE and not n.leader then
      -- at glue real space chars are inserted, but this has
      -- been done if the previous shipout wandering, so here it is ignored
    elseif n.id == LOCAL_PAR then
      -- local_par is ignored
    elseif n.id == PENALTY then
      -- penalty is ignored
    elseif n.id == KERN then
      -- kern is ignored
      ltx.@@.trace.log ("INFO TAG-KERN-SUBTYPE: "..
                        node.type(node.getid(n)).." "..n.subtype,4)
    else
      -- math is currently only logged.
      -- we could mark the whole as math
      -- for inner processing the mlist_to_hlist callback is probably needed.
      if n.id == MATH then
        ltx.@@.trace.log("INFO TAG-MATH-SUBTYPE: "..
                         node.type(node.getid(n)).." "..@@_get_mathsubtype(n),4)
      end
      -- endmath
      ltx.@@.trace.log("INFO TAG-MC-COMPARE: current "..
                       mccnt.." prev "..mccntprev,4)
      if mccnt~=mccntprev then
        -- a new mc chunk
        ltx.@@.trace.log ("INFO TAG-NEW-MC-NODE: "..
                          node.type(node.getid(n))..
                          " MC"..tostring(mccnt)..
                          " <=> PREVIOUS "..tostring(mccntprev),4)
        if mcopen~=0 then
          -- there is a chunk open, close it (hope there is only one ...
          box.list=@@_insert_emc_node (box.list,n)
          mcopen = mcopen - 1
          ltx.@@.trace.log ("INFO TAG-INSERT-EMC: " ..
                            mcpagecnt .. " MCOPEN = " .. mcopen,3)
          if mcopen ~=0 then
            ltx.@@.trace.log ("WARN TAG-OPEN-MC: " .. mcopen,1)
          end
        end
        -- the data stored on the tex side for this chunk (nil if untagged)
        local mcdata = ltx.@@.mc[mccnt]
        if mcdata then
          if mcdata["artifact"] then
            ltx.@@.trace.log("INFO TAG-INSERT-ARTIFACT: "..
                             tostring(mcdata["artifact"]),3)
            if mcdata["artifact"] == "" then
              -- artifact without type: plain BMC
              box.list = @@_insert_bmc_node (box.list,n,"Artifact")
            else
              box.list = @@_insert_bdc_node (box.list,n,"Artifact", "/Type /"..mcdata["artifact"])
            end
          else
            ltx.@@.trace.log("INFO TAG-INSERT-TAG: "..
                             tostring(tag),3)
            mcpagecnt = mcpagecnt +1
            ltx.@@.trace.log ("INFO TAG-INSERT-BDC: "..mcpagecnt,3)
            -- build the property dictionary of the BDC operator
            local dict= "/MCID "..mcpagecnt
            if mcdata["raw"] then
              ltx.@@.trace.log("INFO TAG-USE-RAW: "..
                               tostring(mcdata["raw"]),3)
              dict= dict .. " " .. mcdata["raw"]
            end
            if mcdata["alt"] then
              ltx.@@.trace.log("INFO TAG-USE-ALT: "..
                               tostring(mcdata["alt"]),3)
              dict= dict .. " " .. mcdata["alt"]
            end
            if mcdata["actualtext"] then
              ltx.@@.trace.log("INFO TAG-USE-ACTUALTEXT: "..
                               tostring(mcdata["actualtext"]),3)
              dict= dict .. " " .. mcdata["actualtext"]
            end
            box.list = @@_insert_bdc_node (box.list,n,tag, dict)
            -- remember the kid and the page relation for the
            -- structure tree and the parent tree
            ltx.@@.func.store_mc_kid (mccnt,mcpagecnt,abspage)
            ltx.@@.func.store_mc_in_page(mccnt,mcpagecnt,abspage)
            ltx.@@.trace.show_mc_data (mccnt,3)
          end
          mcopen = mcopen + 1
        else
          if tagunmarkedbool.mode == truebool.mode then
            ltx.@@.trace.log("INFO TAG-NOT-TAGGED: this has not been tagged, using artifact",2)
            box.list = @@_insert_bmc_node (box.list,n,"Artifact")
            mcopen = mcopen + 1
          else
            ltx.@@.trace.log("WARN TAG-NOT-TAGGED: this has not been tagged",1)
          end
        end
        mccntprev = mccnt
      end
    end -- end if
  end -- end for
  if head then
    local mccnthead, mctypehead, taghead = @@_get_mc_cnt_type_tag (head)
    ltx.@@.trace.log ("INFO TAG-ENDHEAD: " ..
                      node.type(node.getid(head))..
                      " MC"..tostring(mccnthead)..
                      " => TAG "..tostring(mctypehead)..
                      " => "..tostring(taghead),4)
  else
    ltx.@@.trace.log ("INFO TAG-ENDHEAD: ".. tostring(head),4)
  end
  ltx.@@.trace.log ("INFO TAG-QUITTING-BOX "..
                    tostring(name)..
                    " TYPE ".. node.type(node.getid(box)),4)
  return mcopen,mcpagecnt,mccntprev,mctypeprev
end
%    \end{macrocode}
% \end{macro}
% \begin{macro}{ltx.@@.func.mark_shipout}
% This is the function used in the callback. Besides calling the traversing
% function it also checks if there is an open MC-chunk from a page
% break and inserts the needed EMC literal.
%    \begin{macrocode}
-- Traverse the shipout box `box` and, if a marked content chunk is still
-- open afterwards, append an EMC node at the end of the box list.
function ltx.@@.func.mark_shipout (box)
  -- only the first return value (the number of open chunks) is needed;
  -- it is kept local so that no global is created as a side effect
  local mcopen = ltx.@@.func.mark_page_elements (box,-1,-100,0,"Shipout",-1)
  if mcopen~=0 then
    -- there is a chunk open, close it (hope there is only one ...
    local emcnode = @@_backend_create_emc_node ()
    local list = box.list
    if list then
      list = node.insert_after (list,node.tail(list),emcnode)
      mcopen = mcopen - 1
      ltx.@@.trace.log ("INFO SHIPOUT-INSERT-LAST-EMC: MCOPEN " .. mcopen,3)
    else
      ltx.@@.trace.log ("WARN SHIPOUT-UPS: this shouldn't happen",0)
    end
    if mcopen ~=0 then
      ltx.@@.trace.log ("WARN SHIPOUT-MC-OPEN: " .. mcopen,1)
    end
  end
end
%    \end{macrocode}
% \end{macro}
% \section{Parenttree}
% \begin{macro}
%  {
%    ltx.@@.func.fill_parent_tree_line,
%    ltx.@@.func.output_parenttree
%  }
% These functions create the parent tree. The second, main function
% is used in the tagpdf-tree code.
% TODO check if the tree code can move into the backend code.
%    \begin{macrocode}
-- Build the parent tree entry for the (tex) page `page` and return it as
-- a string of the form "<pdfpage> [<objref> ...]". A page without stored
-- mc data gives an empty array.
function ltx.@@.func.fill_parent_tree_line (page)
  -- we need to get page-> i=kid -> mcnum -> structnum
  -- pay attention: the kid numbers and the page number in the parent tree start with 0!
  local numsentry =""
  local pdfpage = page-1
  local pagedata = ltx.@@.page[page]

  -- object reference of the structure owning the chunk `mcnum`; a chunk
  -- without recorded parent falls back to structure 0 in both branches below
  local function struct_objref (mcnum)
    local structnum = ltx.@@.mc[mcnum]["parent"] or 0
    return @@_pdf_object_ref('@@/struct',structnum)
  end

  if pagedata and pagedata[0] then
    -- the entries have the keys 0..n, so # gives n, the highest MCID
    local mcchunks=#pagedata
    ltx.@@.trace.log("INFO PARENTTREE-NUM: page "..
                     page.." has "..mcchunks.."+1 Elements ",4)
    for i=0,mcchunks do
      -- logs the absolute mc number stored for the MCID i
      ltx.@@.trace.log("INFO PARENTTREE-CHUNKS: "..
                       pagedata[i],4)
    end
    if mcchunks == 0 then
      -- only one chunk: the one-element array is written directly
      local objref = struct_objref (pagedata[0])
      ltx.@@.trace.log("INFO PARENTTREE-STRUCT-OBJREF: =====>"..
                       tostring(objref),5)
      numsentry = pdfpage .. " [".. objref .. "]"
      ltx.@@.trace.log("INFO PARENTTREE-NUMENTRY: page " ..
                       page.. " num entry = ".. numsentry,3)
    else
      numsentry = pdfpage .. " ["
      for i=0,mcchunks do
        numsentry = numsentry .. " ".. struct_objref (pagedata[i])
      end
      numsentry = numsentry .. "] "
      ltx.@@.trace.log("INFO PARENTTREE-NUMENTRY: page " ..
                       page.. " num entry = ".. numsentry,3)
    end
  else
    ltx.@@.trace.log ("INFO PARENTTREE-NO-DATA: page "..page,3)
    numsentry = pdfpage.." []"
  end
  return numsentry
end

-- Print the parent tree lines of the pages 1..abspage to TeX,
-- one line per page.
function ltx.@@.func.output_parenttree (abspage)
  for i=1,abspage do
    local line = ltx.@@.func.fill_parent_tree_line (i) .. "^^J"
    tex.sprint(catlatex,line)
  end
end
%    \end{macrocode}
% \end{macro}
%
% \begin{macro}
%  {
%    process_softhyphen_pre,
%    process_softhyphen_post
%  }
% First some local definitions. Since these are only needed locally everything gets wrapped into a block.
%    \begin{macrocode}
do
  local properties = node.get_properties_table()
  -- key under which a glyph is flagged in its node properties
  local is_soft_hyphen_prop = 'tagpdf.rewrite-softhyphen.is_soft_hyphen'
  local hyphen_char = 0x2D       -- U+002D HYPHEN-MINUS
  local soft_hyphen_char = 0xAD  -- U+00AD SOFT HYPHEN
%    \end{macrocode}
%
% A lookup table to test if the font supports the soft hyphen glyph.
%    \begin{macrocode}
  -- The answer is computed on the first access for a font id and then
  -- stored in the table: true only for an opentype or truetype font
  -- which has an entry for the soft hyphen in its characters table.
  local softhyphen_fonts = setmetatable({}, {__index = function(t, fid)
    local fdir = identifiers[fid]
    local format = fdir and fdir.format
    local result = (format == 'opentype' or format == 'truetype')
    local characters = fdir and fdir.characters
    result = result and (characters and characters[soft_hyphen_char]) ~= nil
    t[fid] = result
    return result
  end})
%    \end{macrocode}
%
% A pre shaping callback to mark hyphens as being hyphenation hyphens.
% This runs before shaping to avoid affecting hyphens moved into
% discretionaries during shaping.
%    \begin{macrocode}
  -- Flags every character of the pre-break list of explicit and regular
  -- discretionaries. Does nothing if the softhyphen switch is off.
  -- Always returns true.
  local function process_softhyphen_pre(head, _context, _dir)
    if softhyphenbool.mode ~= truebool.mode then return true end
    for disc, sub in node.traverse_id(DISC, head) do
      if sub == explicit_disc or sub == regular_disc then
        for n, _ch, _f in node.traverse_char(disc.pre) do
          local props = properties[n]
          if not props then
            props = {}
            properties[n] = props
          end
          props[is_soft_hyphen_prop] = true
        end
      end
    end
    return true
  end
%    \end{macrocode}
%
% Finally do the actual replacement after shaping. No checking for double processing here
% since the operation is idempotent.
%    \begin{macrocode}
  -- Replaces a flagged hyphen glyph in the pre-break list of a
  -- discretionary by the soft hyphen, if the font of the glyph supports it.
  -- Does nothing if the softhyphen switch is off. Always returns true.
  local function process_softhyphen_post(head, _context, _dir)
    if softhyphenbool.mode ~= truebool.mode then return true end
    for disc in node.traverse_id(DISC, head) do
      for n, ch, fid in node.traverse_glyph(disc.pre) do
        local props = properties[n]
        if softhyphen_fonts[fid] and ch == hyphen_char and props and props[is_soft_hyphen_prop] then
          n.char = soft_hyphen_char
          -- the stored glyph_info is dropped together with the char change
          props.glyph_info = nil
        end
      end
    end
    return true
  end

  luatexbase.add_to_callback('pre_shaping_filter', process_softhyphen_pre, 'tagpdf.rewrite-softhyphen')
  luatexbase.add_to_callback('post_shaping_filter', process_softhyphen_post, 'tagpdf.rewrite-softhyphen')
end
%    \end{macrocode}
% \end{macro}
%
%    \begin{macrocode}
%</lua>
%    \end{macrocode}
% \end{implementation}
% \PrintIndex