def fetch_company_detail(self, result_index=0):
    """Fetch and parse the SI (Strukturierter Registerinhalt) document.

    Method of ``HandelsRegister``. It re-reads the search-result HTML that
    ``search_company()`` stored in ``self._last_search_html``, locates the
    "SI" link of the requested result row, replays the PrimeFaces submit
    parameters found in that link's ``onclick`` attribute through the
    ``ergebnissForm`` form, and parses the XML that comes back.

    Args:
        result_index: Zero-based row of the search result whose SI document
            should be fetched.

    Returns:
        The dict produced by ``parse_si_detail()``, or ``None`` when no
        search has been run yet, the row has no SI link, or the link carries
        no submit parameters.
    """
    # search_company() must have run first; without its HTML there is
    # nothing to look the link up in.
    html = getattr(self, '_last_search_html', None)
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Link ids look like
    #   ergebnissForm:selectedSuchErgebnisFormTable:{index}:...:fade_
    # Anchor the row index directly behind the table id: a bare ':{index}:'
    # test would also match a later id segment that happens to carry the
    # same number and could pick a link from a different row.
    row_marker = 'selectedSuchErgebnisFormTable:{}:'.format(result_index)
    si_link = None
    for anchor in soup.find_all('a'):
        if row_marker not in anchor.get('id', ''):
            continue
        span = anchor.find('span')
        if span is not None and span.text.strip() == 'SI':
            si_link = anchor
            break

    if si_link is None:
        return None

    onclick = si_link.get('onclick', '')
    if not onclick:
        return None

    # PrimeFaces.addSubmitParam(...) passes a {'key':'value', ...} literal;
    # tolerate optional whitespace around the colon.
    pairs = re.findall(r"'([^']+)'\s*:\s*'([^']*)'", onclick)
    if not pairs:
        return None

    # Select the result form and inject the parameters as hidden controls,
    # which is what the browser-side click handler would have done.
    self.browser.select_form(name='ergebnissForm')
    for key, value in pairs:
        self.browser.form.new_control('hidden', key, {'value': value})
    self.browser.form.fixup()

    response_si = self.browser.submit()
    si_xml = response_si.read().decode('utf-8')

    # Cache the SI XML. The payload was decoded as UTF-8, so write it back
    # as UTF-8 instead of relying on the platform default encoding.
    cachename = self.companyname2cachename(self.args.schlagwoerter + '_SI')
    with open(cachename, 'w', encoding='utf-8') as f:
        f.write(si_xml)

    return parse_si_detail(si_xml)
+ """ + root = ET.fromstring(xml_str) + comment_map = _build_comment_map(xml_str) + detail = {} + + # Build a role-number-to-person/org mapping from beteiligung entries + roles = {} # rollennummer -> {role_code, role_label, person_data} + for beteiligung in root.findall('.//tns:beteiligung', NS): + rolle_elems = beteiligung.findall('tns:rolle', NS) + beteiligter = beteiligung.find('tns:beteiligter', NS) + if beteiligter is None: + continue + + for rolle in rolle_elems: + rollennummer = _text(rolle, 'tns:rollennummer') + role_code_elem = rolle.find('tns:rollenbezeichnung', NS) + role_code = _text(role_code_elem, 'code') if role_code_elem is not None else None + role_label = ROLE_CODES.get(role_code, comment_map.get(role_code, role_code)) + + person_data = _parse_beteiligter(beteiligter, comment_map) + if rollennummer: + roles[rollennummer] = { + 'role_code': role_code, + 'role_label': role_label, + **person_data + } + + # Company info from the Rechtsträger (role code 287) + for rnum, rdata in roles.items(): + if rdata.get('role_code') == '287': + detail['name'] = rdata.get('name') + detail['legal_form'] = rdata.get('legal_form') + detail['legal_form_code'] = rdata.get('legal_form_code') + detail['seat'] = rdata.get('seat') + if rdata.get('address'): + detail['address'] = rdata['address'] + break + + # basisdatenRegister + basisdaten = root.find('.//tns:basisdatenRegister', NS) + if basisdaten is not None: + # Satzungsdatum + satzung = basisdaten.find('.//tns:aktuellesSatzungsdatum', NS) + if satzung is not None and satzung.text: + detail['articles_of_association_date'] = satzung.text + + # Gegenstand (business purpose) + gegenstand = basisdaten.find('.//tns:gegenstand', NS) + if gegenstand is not None and gegenstand.text: + detail['business_purpose'] = gegenstand.text.strip() + + # Vertretungsregelung (representation rules) + allg_vertretung = basisdaten.find('.//tns:allgemeineVertretungsregelung', NS) + if allg_vertretung is not None: + vb = 
allg_vertretung.find('.//tns:vertretungsbefugnis', NS) + if vb is not None: + vb_code = _text(vb, 'code') + if vb_code and vb_code in comment_map: + detail['representation_rules'] = comment_map[vb_code] + + # Representatives with their specific rules + for vb_elem in basisdaten.findall('.//tns:vertretungsberechtigte', NS): + ref = _text(vb_elem, 'tns:ref.rollennummer') + if ref and ref in roles: + rep = roles[ref] + besondere = vb_elem.find('tns:besondereVertretungsregelung', NS) + if besondere is not None: + freitext = besondere.find('.//tns:vertretungsbefugnisFreitext', NS) + if freitext is not None and freitext.text: + rep['representation'] = freitext.text.strip().rstrip(';') + else: + vb_code_elem = besondere.find('.//tns:vertretungsbefugnis', NS) + if vb_code_elem is not None: + code_val = _text(vb_code_elem, 'code') + if code_val and code_val in comment_map: + rep['representation'] = comment_map[code_val] + + befreiung = besondere.find('.//tns:befreiungVon181BGB', NS) + if befreiung is not None: + rep['exempt_181_bgb'] = True + + # Collect managing directors and prokura holders + directors = [] + prokura = [] + for rnum, rdata in roles.items(): + if rdata.get('role_code') == '086': # Geschäftsführer + directors.append(rdata) + elif rdata.get('role_code') == '087': # Vorstand + directors.append(rdata) + elif rdata.get('role_code') == '285': # Prokurist + prokura.append(rdata) + + if directors: + detail['directors'] = directors + if prokura: + detail['prokura'] = prokura + + # Capital (Stammkapital / Grundkapital) + stammkapital = root.find('.//tns:stammkapital', NS) + if stammkapital is None: + stammkapital = root.find('.//tns:grundkapital', NS) + if stammkapital is not None: + zahl = _text(stammkapital, 'tns:zahl') + waehrung_elem = stammkapital.find('.//tns:waehrung', NS) + waehrung_code = _text(waehrung_elem, 'code') if waehrung_elem is not None else None + if zahl: + detail['capital'] = { + 'amount': zahl, + 'currency': waehrung_code or 'EUR' + } + + # 
Register info + aktenzeichen = root.find('.//tns:aktenzeichen.strukturiert', NS) + if aktenzeichen is not None: + register = aktenzeichen.find('tns:register', NS) + nummer = _text(aktenzeichen, 'tns:laufendeNummer') + if register is not None and nummer: + reg_code = _text(register, 'code') + detail['register_type'] = reg_code + detail['register_number'] = nummer + + # Auszug metadata + auszug = root.find('.//tns:auszug', NS) + if auszug is not None: + detail['retrieval_date'] = _text(auszug, 'tns:abrufdatum') + detail['last_entry_date'] = _text(auszug, 'tns:letzteEintragung') + detail['num_entries'] = _text(auszug, 'tns:anzahlEintragungen') + + # Eintragungstexte (register entry texts) + entries = [] + for et_elem in root.findall('.//tns:eintragungstext', NS): + entry = {} + entry['column'] = _text(et_elem, 'tns:spalte') + entry['position'] = _text(et_elem, 'tns:position') + entry['number'] = _text(et_elem, 'tns:laufendeNummer') + entry['text'] = _text(et_elem, 'tns:text') + art_elem = et_elem.find('tns:eintragungsart', NS) + if art_elem is not None: + art_code = _text(art_elem, 'code') + if art_code and art_code in comment_map: + entry['type'] = comment_map[art_code] + entries.append(entry) + if entries: + detail['register_entries'] = entries + + return detail + + +def _text(parent, path): + """Extract text from an XML element, or None.""" + if parent is None: + return None + elem = parent.find(path, NS) + if elem is not None and elem.text: + return elem.text.strip() + return None + + +def _parse_beteiligter(beteiligter, comment_map): + """Parse a beteiligter element into a person/org dict.""" + data = {} + + # Natural person + person = beteiligter.find('.//tns:natuerlichePerson', NS) + if person is not None: + vorname = _text(person, './/tns:vorname') + nachname = _text(person, './/tns:nachname') + if vorname and nachname: + data['name'] = '%s %s' % (vorname, nachname) + elif nachname: + data['name'] = nachname + + geburtsdatum = _text(person, 
def _pr_people(title, people):
    """Print a headed list of persons with their representation rule."""
    print(title)
    for person in people:
        extra = ''
        if person.get('representation'):
            extra = ' (%s)' % person['representation']
        # 'name' can be present but None; fall back to '?' in that case too.
        print(' %s%s' % (person.get('name') or '?', extra))


def pr_company_detail(detail):
    """Print the SI detail fields in human-readable format.

    Args:
        detail: Dict as returned by ``parse_si_detail()``; a falsy value
            prints a placeholder line instead.

    Fields whose value is ``None`` are treated like missing fields: they are
    skipped, or shown as ``-`` / ``?`` / empty where a placeholder is
    printed, rather than rendered as the text ``None``.
    """
    if not detail:
        print(' (no detail data available)')
        return

    print()
    print('--- Detail (SI) ---')
    for key in ('name', 'legal_form', 'seat', 'business_purpose',
                'articles_of_association_date', 'representation_rules'):
        value = detail.get(key)
        if value is not None:
            label = key.replace('_', ' ').title()
            print('%s: %s' % (label, value))

    addr = detail.get('address')
    if addr:
        parts = []
        if addr.get('street'):
            parts.append(addr['street'])
        # Postal code and city share one segment; either may be missing.
        locality = ' '.join(
            p for p in (addr.get('postal_code'), addr.get('city')) if p)
        if locality:
            parts.append(locality)
        if parts:
            print('Address: %s' % ', '.join(parts))

    if 'capital' in detail:
        cap = detail['capital']
        print('Capital: %s %s' % (cap['amount'], cap['currency']))

    if 'directors' in detail:
        _pr_people('Directors:', detail['directors'])

    if 'prokura' in detail:
        _pr_people('Prokura:', detail['prokura'])

    if 'register_entries' in detail:
        print('Register Entries:')
        for entry in detail['register_entries']:
            etype = entry.get('type') or ''
            text = entry.get('text') or ''
            print(' [%s] %s' % (etype, text))

    # dict.get(key, default) does not cover keys stored with value None,
    # hence the explicit "or".
    print('Last Entry: %s' % (detail.get('last_entry_date') or '-'))
    print('Retrieval Date: %s' % (detail.get('retrieval_date') or '-'))
result.get('data-ri') @@ -195,6 +555,12 @@ def parse_args(): help="Return response as JSON", action="store_true" ) + parser.add_argument( + "-det", + "--detail", + help="Fetch SI (structured register content) detail for the first search result", + action="store_true" + ) args = parser.parse_args() @@ -214,8 +580,17 @@ def parse_args(): h.open_startpage() companies = h.search_company() if companies is not None: + detail = None + if args.detail and len(companies) > 0: + detail = h.fetch_company_detail(result_index=0) + if args.json: - print(json.dumps(companies)) + output = companies + if detail: + output[0]['detail'] = detail + print(json.dumps(output)) else: for c in companies: pr_company_info(c) + if detail: + pr_company_detail(detail) diff --git a/test_handelsregister.py b/test_handelsregister.py index fa1951a..9b6a717 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -1,12 +1,12 @@ import pytest -from handelsregister import get_companies_in_searchresults,HandelsRegister +from handelsregister import get_companies_in_searchresults,HandelsRegister,parse_si_detail import argparse def test_parse_search_result(): html = '%s' % """
Berlin District court Berlin (Charlottenburg) HRB 44343
GASAG AGBerlincurrently registered
History
1.) Gasag Berliner Gaswerke Aktiengesellschaft1.) Berlin
""" res = get_companies_in_searchresults(html) assert res == [{ - 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', + 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', 'register_num': 'HRB 44343 B', 'name':'GASAG AG', 'state':'Berlin', @@ -17,6 +17,75 @@ def test_parse_search_result(): },] +def test_parse_si_detail(): + """Test parsing of SI (Strukturierter Registerinhalt) XJustiz XML.""" + si_xml = """unbekannt2026-02-25T11:21:36ZU1308test-id002RegisSTARTest1.00U1308HRB32007Strukturierter Registerinhalt114536528302871Test Company GmbH221110Leipzig003Teststraße4204109Leipzig000414533251500864MaxMustermann1984-06-25017Leipzig514543556802855ErikaMusterfrau1990-01-15017Berlin0021612009Gesellschaftsvertrag vom 18.06.2015.11:21:362026-02-252025-06-0272025-05-222015-06-18Test Company GmbH221110Leipzig003Teststraße4204109Leipzig0000664einzelvertretungsberechtigt;0115002Softwareentwicklung und IT-Beratung.25000.00EUR""" + + detail = parse_si_detail(si_xml) + + # Company info + assert detail['name'] == 'Test Company GmbH' + assert 'GmbH' in detail['legal_form'] + assert detail['seat'] == 'Leipzig' + + # Address + assert detail['address']['street'] == 'Teststraße 42' + assert detail['address']['postal_code'] == '04109' + assert detail['address']['city'] == 'Leipzig' + + # Business purpose + assert 'Softwareentwicklung' in detail['business_purpose'] + + # Capital + assert detail['capital']['amount'] == '25000.00' + assert detail['capital']['currency'] == 'EUR' + + # Directors + assert len(detail['directors']) == 1 + assert detail['directors'][0]['name'] == 'Max Mustermann' + assert detail['directors'][0]['date_of_birth'] == '1984-06-25' + + # Prokura + assert len(detail['prokura']) == 1 + assert detail['prokura'][0]['name'] == 'Erika Musterfrau' + + # Register info + assert detail['register_type'] == 'HRB' + assert detail['register_number'] == '32007' + assert detail['retrieval_date'] == '2026-02-25' + assert detail['last_entry_date'] == 
'2025-06-02' + + # Articles of association + assert detail['articles_of_association_date'] == '2015-06-18' + + # Register entries + assert len(detail['register_entries']) >= 1 + assert 'Gesellschaftsvertrag' in detail['register_entries'][0]['text'] + + +def test_fetch_detail(): + """Integration test: search for a known company and fetch its SI data.""" + args = argparse.Namespace(debug=False, force=True, schlagwoerter='Gecko Two GmbH', + schlagwortOptionen='exact', json=False, detail=True) + h = HandelsRegister(args) + h.open_startpage() + companies = h.search_company() + assert companies is not None + assert len(companies) > 0 + + detail = h.fetch_company_detail(result_index=0) + assert detail is not None + + # Basic fields that should always be present + assert detail.get('name') is not None + assert 'Gecko' in detail['name'] + assert detail.get('seat') is not None + assert detail.get('address') is not None + assert detail.get('capital') is not None + assert detail.get('directors') is not None + assert len(detail['directors']) > 0 + + @pytest.mark.parametrize("company, state_id", [ ("Hafen Hamburg", "Hamburg"), ("Bayerische Motoren Werke", "Bayern"), @@ -37,7 +106,7 @@ def test_parse_search_result(): ]) def test_search_by_state_company(company, state_id): - args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False) + args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False, detail=False) h = HandelsRegister(args) h.open_startpage() companies = h.search_company() @@ -45,13 +114,13 @@ def test_search_by_state_company(company, state_id): assert len(companies) > 0 def test_haus_anker_b_suffix(): - args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', json=False) + args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', 
json=False, detail=False) h = HandelsRegister(args) h.open_startpage() companies = h.search_company() assert companies is not None - + target_company = next((c for c in companies if '138434' in c['register_num']), None) - + assert target_company is not None, "Haus-Anker Verwaltungs GmbH with expected number not found" - assert target_company['register_num'] == 'HRB 138434 B' \ No newline at end of file + assert target_company['register_num'] == 'HRB 138434 B'