/**
 * German word stems used for language scoring. A word counts as a hit when it
 * contains the stem anywhere (substring match), not only as a prefix.
 */
const DE_STEMS = [
  'mit', 'und', 'oder', 'für', 'vom', 'zum', 'zur', 'gebraten', 'kartoffel', 'gemüse', 'suppe',
  'kuchen', 'schwein', 'rind', 'hähnchen', 'huhn', 'fisch', 'nudel', 'soße', 'sosse', 'wurst',
  'kürbis', 'braten', 'sahne', 'apfel', 'käse', 'fleisch', 'pilz', 'kirsch', 'joghurt', 'spätzle',
  'knödel', 'kraut', 'schnitzel', 'püree', 'rahm', 'erdbeer', 'schoko', 'vanille', 'tomate',
  'gurke', 'salat', 'zwiebel', 'paprika', 'reis', 'bohne', 'erbse', 'karotte', 'möhre', 'lauch',
  'knoblauch', 'chili', 'gewürz', 'kräuter', 'pfeffer', 'salz', 'butter', 'milch', 'eier',
  'pfanne', 'auflauf', 'gratin', 'ragout', 'gulasch', 'eintopf', 'filet', 'steak', 'brust',
  'salami', 'schinken', 'speck', 'brokkoli', 'blumenkohl', 'zucchini', 'aubergine',
  'spinat', 'spargel', 'olive', 'mandel', 'nuss', 'honig', 'senf', 'essig', 'öl', 'brot',
  'brötchen', 'pfannkuchen', 'eis', 'torte', 'dessert', 'kompott', 'obst', 'frucht', 'beere',
  'plunder', 'dip',
];

/**
 * English word stems used for language scoring (substring match, see DE_STEMS).
 */
const EN_STEMS = [
  'with', 'and', 'or', 'for', 'from', 'to', 'fried', 'potato', 'vegetable', 'soup', 'cake',
  'pork', 'beef', 'chicken', 'fish', 'noodle', 'sauce', 'sausage', 'pumpkin', 'roast',
  'cream', 'apple', 'cheese', 'meat', 'mushroom', 'cherry', 'yogurt', 'wedge', 'sweet',
  'sour', 'dumpling', 'cabbage', 'mash', 'strawberr', 'choco', 'vanilla', 'tomat', 'cucumber',
  'salad', 'onion', 'pepper', 'rice', 'bean', 'pea', 'carrot', 'leek', 'garlic', 'chili',
  'spice', 'herb', 'salt', 'butter', 'milk', 'egg', 'pan', 'casserole', 'gratin', 'ragout',
  'goulash', 'stew', 'filet', 'steak', 'breast', 'salami', 'ham', 'bacon', 'broccoli',
  'cauliflower', 'zucchini', 'eggplant', 'spinach', 'asparagus', 'olive', 'almond', 'nut',
  'honey', 'mustard', 'vinegar', 'oil', 'bread', 'bun', 'pancake', 'ice', 'tart', 'dessert',
  'compote', 'fruit', 'berry', 'dip', 'danish',
];

/** Prefix put in front of every output entry. */
const BULLET = '• ';

/** First parenthesised run of capital letters/spaces, plus trailing whitespace. */
const ALLERGEN_TAG = /\(([A-Z ]+)\)\s*/;

/** Same tag, but only where more text follows it (i.e. not at the very end). */
const ALLERGEN_TAG_BEFORE_TEXT = /\(([A-Z ]+)\)\s*(?=\S)/g;

/**
 * Scores a list of words against the German and English stem lists.
 *
 * Each word is lower-cased and stripped of everything except a-z, ä, ö, ü, ß.
 * Every stem contained in the word adds `word.length / stem.length` to that
 * language's score.
 * NOTE(review): this weighting rewards short stems inside long words (e.g.
 * 'to' inside 'tofu' adds 2 to the English score) — confirm this is intended.
 *
 * @param {string[]} wordArray - Words to score.
 * @returns {{de: number, en: number}} Accumulated score per language.
 */
function scoreBlock(wordArray) {
  let de = 0;
  let en = 0;
  for (const word of wordArray) {
    const w = word.toLowerCase().replace(/[^a-zäöüß]/g, '');
    if (!w) continue;
    for (const stem of DE_STEMS) {
      if (w.includes(stem)) de += w.length / stem.length;
    }
    for (const stem of EN_STEMS) {
      if (w.includes(stem)) en += w.length / stem.length;
    }
  }
  return { de, en };
}

/**
 * Splits a fragment of the form "<English words> <German words>" at the word
 * boundary that best separates English (left) from German (right).
 *
 * Every boundary k = 1..words.length is tried; k === words.length means the
 * right side is empty, i.e. the whole fragment is English. Without that
 * candidate an all-English fragment would always lose its last word(s) to the
 * German side. On ties the earliest boundary wins.
 *
 * @param {string} fragment - Text without an allergen tag.
 * @returns {{enPart: string, nextDe: string}} The English part and the German
 *   text that follows it ('' when no German part was found).
 */
function heuristicSplitEnDe(fragment) {
  const words = fragment.trim().split(/\s+/);
  if (words.length < 2) return { enPart: fragment, nextDe: '' };

  let bestK = -1;
  let maxScore = -Infinity;

  for (let k = 1; k <= words.length; k++) {
    const leftScore = scoreBlock(words.slice(0, k));
    const rightScore = scoreBlock(words.slice(k));

    // Reward English evidence on the left and German evidence on the right.
    const score = (leftScore.en - leftScore.de) + (rightScore.de - rightScore.en);

    if (score > maxScore) {
      maxScore = score;
      bestK = k;
    }
  }

  if (bestK !== -1 && maxScore > 0) {
    return {
      enPart: words.slice(0, bestK).join(' '),
      nextDe: words.slice(bestK).join(' '),
    };
  }
  // No boundary had a positive score: treat the whole fragment as English.
  return { enPart: fragment, nextDe: '' };
}

/**
 * Splits a bilingual menu string into German and English bullet lists.
 *
 * Expected shape: "DE / EN(ALLERGENS) DE / EN(ALLERGENS) ...". Text without
 * ' / ' is classified as a whole by stem score. Text with more than four
 * ' / '-separated parts is returned unsplit in the German column.
 *
 * @param {string} text - Raw menu text.
 * @returns {{de: string, en: string, raw: string}} Newline-separated bullet
 *   lists per language, plus the input formatted as bullets with a line break
 *   after each allergen tag that is followed by more text. All three are ''
 *   for empty input.
 */
function splitLanguage(text) {
  if (!text) return { de: '', en: '', raw: '' };

  const formattedRaw = BULLET + text.replace(ALLERGEN_TAG_BEFORE_TEXT, `($1)\n${BULLET}`);

  if (!text.includes(' / ')) {
    // Single-language text: put all of it into the higher-scoring column
    // (German wins ties).
    const score = scoreBlock(text.toLowerCase().split(/\s+/));
    if (score.en > score.de) {
      return { de: '', en: formattedRaw, raw: formattedRaw };
    }
    return { de: formattedRaw, en: '', raw: formattedRaw };
  }

  const parts = text.split(' / ');
  if (parts.length > 4) {
    return { de: formattedRaw, en: '', raw: formattedRaw };
  }

  // The text before the first ' / ' is always taken as German.
  const deParts = [parts[0].trim()];
  const enParts = [];

  for (let i = 1; i < parts.length; i++) {
    const fragment = parts[i].trim();
    const match = fragment.match(ALLERGEN_TAG);

    if (match) {
      // "<EN>(CODES) <next DE>": the tag ends the English dish; the same
      // codes are also appended to the most recent German dish.
      const enPart = fragment.substring(0, match.index).trim();
      const allergenCode = match[1];
      const nextDe = fragment.substring(match.index + match[0].length).trim();

      enParts.push(`${enPart}(${allergenCode})`);
      deParts[deParts.length - 1] = `${deParts[deParts.length - 1]}(${allergenCode})`;

      if (nextDe) {
        deParts.push(nextDe);
      }
    } else {
      // No tag to anchor on: fall back to the stem-score heuristic.
      const split = heuristicSplitEnDe(fragment);
      enParts.push(split.enPart);
      if (split.nextDe) {
        deParts.push(split.nextDe);
      }
    }
  }

  return {
    de: deParts.map((p) => BULLET + p).join('\n'),
    en: enParts.map((p) => BULLET + p).join('\n'),
    raw: formattedRaw,
  };
}

console.log(splitLanguage("Hühnersuppe mit Reis / Chicken soup with rice Jambalaya mit Tofu und Joghurtdip / Jambalaya with tofu and yogurt dip Mini Plunder / Mini danishes(ACGHO)"));