Google Keepのエクスポート形式のHTMLをObsidian形式のMarkdownに変換するPythonスクリプト例

#!/usr/bin/env python3
"""
Google Keep HTML エクスポートを Obsidian Markdown に変換するスクリプト

使い方:
    python html_to_obsidian_md.py [-f] input.html [output.md]

    output.mdを指定しない場合は、入力ファイル名から自動生成されます
    出力ファイルが既に存在する場合は、-f (--force) を付けると上書きします
"""

import sys
import re
from pathlib import Path
from html.parser import HTMLParser
from html import unescape
import argparse


class KeepNoteParser(HTMLParser):
    """Parse the HTML of a single exported Google Keep note.

    Feed the document with ``feed()``; the pieces of the note are then
    available as plain attributes:

    * ``heading``     -- all text seen directly inside ``<div class="heading">``
    * ``datetime``    -- the timestamp found in the heading text, or ``""``
    * ``title``       -- text of ``<div class="title">``
    * ``content``     -- text of ``<div class="content">``; ``<br>`` becomes ``\\n``
    * ``labels``      -- stripped text of every label chip
    * ``weblinks``    -- ``href`` of every WEBLINK chip
    * ``attachments`` -- ``src`` of every ``<img>`` inside the attachments div
    * ``is_archived`` / ``is_pinned`` -- set from the meta-icon spans
    """

    # class attribute of a section <div>  ->  name of the flag it controls
    _SECTION_FLAGS = {
        'heading': 'in_heading',
        'title': 'in_title',
        'content': 'in_content',
        'chips': 'in_chips',
        'attachments': 'in_attachments',
        'meta-icons': 'in_meta_icons',
    }

    # Example: "Oct 28, 2025, 11:16:42 PM". Keep writes &#8239; (U+202F,
    # narrow no-break space) in front of AM/PM, so it is accepted explicitly.
    _DATE_RE = re.compile(
        '(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)'
        r'\s+\d+,\s+\d{4},\s+\d+:\d+:\d+[\s\u202F]+(?:AM|PM)'
    )

    def __init__(self):
        super().__init__()
        # "currently inside section X" flags
        self.in_heading = False
        self.in_title = False
        self.in_content = False
        self.in_chips = False
        self.in_chip = False
        self.in_attachments = False
        self.in_meta_icons = False

        # extracted note data
        self.heading = ""
        self.title = ""
        self.content = ""
        self.labels = []
        self.weblinks = []
        self.attachments = []
        self.is_archived = False
        self.is_pinned = False
        self.datetime = ""

        # state of the chip currently being read
        self.current_chip_text = ""
        self.current_chip_is_weblink = False
        self.current_chip_is_label = False
        self.current_chip_url = ""

        # One entry per open <div>: the flag name it switched on, or None.
        # Lets </div> close exactly the section that the matching <div> opened.
        self._div_stack = []
        # Number of <span> elements open inside the current chip (chip included).
        self._chip_span_depth = 0

    def handle_starttag(self, tag, attrs):
        """Update section flags and collect attribute-based data."""
        # A valueless attribute (``<span class>``) is reported as None;
        # normalise to "" so the substring tests below cannot raise.
        attrs_dict = {name: (value or '') for name, value in attrs}

        if tag == 'div':
            flag = self._SECTION_FLAGS.get(attrs_dict.get('class'))
            if flag:
                setattr(self, flag, True)
            self._div_stack.append(flag)

        elif tag == 'span':
            class_attr = attrs_dict.get('class', '')
            title_attr = attrs_dict.get('title', '')

            # archived / pinned markers
            if 'archived' in class_attr and 'Note archived' in title_attr:
                self.is_archived = True
            elif 'pinned' in class_attr and 'Note pinned' in title_attr:
                self.is_pinned = True

            if self.in_chip:
                # Nested span (e.g. the label name): belongs to the open chip.
                self._chip_span_depth += 1
            elif self.in_chips and 'chip' in class_attr:
                self.in_chip = True
                self._chip_span_depth = 1
                self.current_chip_text = ""
                self.current_chip_url = ""
                self.current_chip_is_weblink = 'WEBLINK' in class_attr
                self.current_chip_is_label = (
                    not self.current_chip_is_weblink and 'label' in class_attr
                )

        elif tag == 'a' and self.in_chip:
            self.current_chip_url = attrs_dict.get('href', '')

        elif tag == 'br' and self.in_content:
            self.content += '\n'

        elif tag == 'img' and self.in_attachments:
            src = attrs_dict.get('src', '')
            if src:
                self.attachments.append(src)

    def handle_endtag(self, tag):
        """Close the section or chip that the matching start tag opened."""
        if tag == 'div':
            # Ignore a stray </div> that has no matching <div>.
            if self._div_stack:
                flag = self._div_stack.pop()
                if flag:
                    # Stay "inside" if an outer div of the same kind is open.
                    setattr(self, flag, flag in self._div_stack)

        elif tag == 'span' and self.in_chip:
            self._chip_span_depth -= 1
            if self._chip_span_depth <= 0:
                # The chip's own </span>: store what was collected.
                if self.current_chip_is_weblink and self.current_chip_url:
                    self.weblinks.append(self.current_chip_url)
                elif self.current_chip_is_label and self.current_chip_text:
                    self.labels.append(self.current_chip_text.strip())

                self.in_chip = False
                self._chip_span_depth = 0
                self.current_chip_is_weblink = False
                self.current_chip_is_label = False

    def handle_data(self, data):
        """Append text to whichever part of the note is currently open."""
        if self.in_heading and not self.in_title and not self.in_meta_icons:
            match = self._DATE_RE.search(data)
            if match:
                self.datetime = match.group()
            self.heading += data
        elif self.in_title:
            self.title += data
        elif self.in_content:
            self.content += data
        elif self.in_chip:
            self.current_chip_text += data


def set_file_times(filepath: Path, timestamp: float) -> bool:
    """Stamp *filepath* with the given time.

    On non-Windows systems the access and modification times are set with
    ``os.utime``. On Windows the creation, access and write times are all
    set through the Win32 ``SetFileTime`` call.

    Args:
        filepath: File to modify.
        timestamp: Unix timestamp (seconds since 1970-01-01).

    Returns:
        True on success, False on failure. Exceptions are not propagated:
        they are reported as a warning on stderr and turned into False.
    """
    import platform
    import os

    try:
        if platform.system() != 'Windows':
            # (atime, mtime)
            os.utime(filepath, (timestamp, timestamp))
            return True

        # ---- Windows ----
        import ctypes
        from ctypes import wintypes

        # Win32 constants
        GENERIC_WRITE = 0x40000000
        OPEN_EXISTING = 3
        FILE_SHARE_READ = 0x00000001
        FILE_SHARE_WRITE = 0x00000002
        # restype is HANDLE (a c_void_p), so ctypes hands back the *unsigned*
        # pointer value (or None for NULL). Comparing with -1 would never
        # match; build INVALID_HANDLE_VALUE with the same conversion instead.
        INVALID_HANDLE_VALUE = ctypes.c_void_p(-1).value

        # FILETIME counts 100 ns ticks since 1601-01-01; this constant is the
        # tick count at the Unix epoch (1970-01-01).
        EPOCH_AS_FILETIME = 116444736000000000
        timestamp_100ns = int((timestamp * 10000000) + EPOCH_AS_FILETIME)

        class FILETIME(ctypes.Structure):
            _fields_ = [
                ("dwLowDateTime", wintypes.DWORD),
                ("dwHighDateTime", wintypes.DWORD)
            ]

        ft = FILETIME()
        ft.dwLowDateTime = timestamp_100ns & 0xFFFFFFFF
        ft.dwHighDateTime = timestamp_100ns >> 32

        # Declare signatures so handles are not truncated to 32-bit ints.
        CreateFileW = ctypes.windll.kernel32.CreateFileW
        CreateFileW.argtypes = [
            wintypes.LPCWSTR,  # lpFileName
            wintypes.DWORD,    # dwDesiredAccess
            wintypes.DWORD,    # dwShareMode
            wintypes.LPVOID,   # lpSecurityAttributes
            wintypes.DWORD,    # dwCreationDisposition
            wintypes.DWORD,    # dwFlagsAndAttributes
            wintypes.HANDLE    # hTemplateFile
        ]
        CreateFileW.restype = wintypes.HANDLE

        SetFileTime = ctypes.windll.kernel32.SetFileTime
        SetFileTime.argtypes = [
            wintypes.HANDLE,            # hFile
            ctypes.POINTER(FILETIME),   # lpCreationTime
            ctypes.POINTER(FILETIME),   # lpLastAccessTime
            ctypes.POINTER(FILETIME)    # lpLastWriteTime
        ]
        SetFileTime.restype = wintypes.BOOL

        CloseHandle = ctypes.windll.kernel32.CloseHandle
        CloseHandle.argtypes = [wintypes.HANDLE]
        CloseHandle.restype = wintypes.BOOL

        handle = CreateFileW(
            str(filepath),
            GENERIC_WRITE,
            FILE_SHARE_READ | FILE_SHARE_WRITE,
            None,
            OPEN_EXISTING,
            0,
            None
        )

        if handle is None or handle == INVALID_HANDLE_VALUE:
            return False

        try:
            # creation, last-access and last-write time all get the same value
            result = SetFileTime(handle, ctypes.byref(ft), ctypes.byref(ft), ctypes.byref(ft))
            return bool(result)
        finally:
            CloseHandle(handle)

    except Exception as e:
        # Best effort: a failed timestamp update must not abort the conversion.
        print(f"  警告: ファイル日時の設定に失敗しました: {e}", file=sys.stderr)
        return False


def html_to_markdown(html_path: Path) -> "tuple[str, str]":
    """Convert one exported Google Keep note to Markdown text.

    The output consists of, in order and each only when present:
    a line of hashtags (``#archived``, ``#pinned``, then one per label with
    spaces replaced by underscores), the note body, a "リンク" section listing
    web links, and a "添付ファイル" section embedding each distinct attachment.

    Args:
        html_path: Path of the HTML file; it is read as UTF-8.

    Returns:
        A ``(markdown, datetime_text)`` pair. ``datetime_text`` is the
        timestamp string found in the note heading, or ``""`` if none.
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        html_content = f.read()

    parser = KeepNoteParser()
    parser.feed(html_content)
    # Flush text the parser is still buffering at end of input.
    parser.close()

    markdown_lines = []

    # Hashtags: state markers first, then the note's labels.
    tags = []
    if parser.is_archived:
        tags.append("#archived")
    if parser.is_pinned:
        tags.append("#pinned")
    # Spaces are replaced because a tag ends at the first space.
    tags.extend(f"#{label.replace(' ', '_')}" for label in parser.labels)

    if tags:
        markdown_lines.append(" ".join(tags) + "\n\n")

    # Body
    if parser.content:
        content = parser.content.strip()
        # Collapse runs of 3+ newlines into a single blank line.
        content = re.sub(r'\n{3,}', '\n\n', content)
        markdown_lines.append(content + "\n")

    # Web links
    if parser.weblinks:
        markdown_lines.append("\n## リンク\n\n")
        markdown_lines.extend(f"- {link}\n" for link in parser.weblinks)

    # Attachments, as Markdown image embeds
    if parser.attachments:
        markdown_lines.append("\n## 添付ファイル\n\n")
        # dict.fromkeys drops duplicates while keeping first-seen order.
        for attachment in dict.fromkeys(parser.attachments):
            markdown_lines.append(f"![{attachment}]({attachment})\n\n")

    return "".join(markdown_lines), parser.datetime


def _parse_keep_timestamp(datetime_str: str) -> float:
    """Convert a heading timestamp such as "Oct 28, 2025, 11:16:42 PM".

    The string is parsed as a naive datetime, so ``timestamp()`` interprets
    it in the machine's local time zone.

    Raises:
        ValueError: if the string does not match the expected format.
    """
    from datetime import datetime
    import locale

    # %b and %p are locale dependent, while the input always uses English
    # names. Switch LC_TIME temporarily and put the old value back afterwards
    # so the process-wide locale is not left modified.
    previous = locale.setlocale(locale.LC_TIME)
    try:
        for candidate in ('en_US.UTF-8', 'C'):
            try:
                locale.setlocale(locale.LC_TIME, candidate)
                break
            except locale.Error:
                continue
        dt = datetime.strptime(datetime_str, '%b %d, %Y, %I:%M:%S %p')
    finally:
        try:
            locale.setlocale(locale.LC_TIME, previous)
        except locale.Error:
            # Restoring is best effort; the parse result is still valid.
            pass

    return dt.timestamp()


def main():
    """Command-line entry point.

    Parses the arguments, converts the input HTML file, writes the Markdown
    file and, when the note heading contained a timestamp, applies it to the
    output file. Exits with status 1 if the input is missing, if the output
    already exists and ``-f`` was not given, or if the conversion fails.
    """
    parser = argparse.ArgumentParser(
        description='Google Keep HTMLエクスポートをObsidian Markdownに変換'
    )
    parser.add_argument('input', help='入力HTMLファイルのパス')
    parser.add_argument('output', nargs='?', help='出力Markdownファイルのパス（省略可）')
    parser.add_argument('-f', '--force', action='store_true',
                        help='出力ファイルが既に存在する場合でも上書き')

    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"エラー: ファイルが見つかりません: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Default output: the input path with its suffix replaced by .md
    output_path = Path(args.output) if args.output else input_path.with_suffix('.md')

    if output_path.exists() and not args.force:
        print(f"エラー: 出力ファイルが既に存在します: {output_path}", file=sys.stderr)
        print("上書きする場合は -f オプションを使用してください", file=sys.stderr)
        sys.exit(1)

    try:
        markdown_content, datetime_str = html_to_markdown(input_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_content)

        # Timestamping is optional: a failure here is only a warning.
        timestamp = None
        time_error = None
        if datetime_str:
            try:
                timestamp = _parse_keep_timestamp(datetime_str)
            except (ValueError, OverflowError, OSError) as e:
                time_error = e

        success = None
        if timestamp is not None:
            success = set_file_times(output_path, timestamp)

        print(f"変換完了: {output_path}")
        if timestamp is not None:
            print(f"  作成日時: {datetime_str}")
        print(f"  出力サイズ: {len(markdown_content)} 文字")

        if success is True:
            print("  ファイル日時: 設定成功")
        elif success is False:
            print("  ファイル日時: 設定失敗（警告）")
        elif time_error is not None:
            print(f"  警告: ファイル日時の設定に失敗しました: {time_error}", file=sys.stderr)

    except Exception as e:
        # Top-level boundary: report any conversion failure and exit non-zero.
        print(f"エラー: 変換中に問題が発生しました: {e}", file=sys.stderr)
        sys.exit(1)


# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Obsidianのコミュニティプラグイン「Custom Attachment Location」が良さそう。画像を貼り付けると自動でノート名と同じディレクトリに格納され、ノートを移動すると画像フォルダも追随する。Vault全体の画像のパスも整理してくれる。
参考：<https://note.com/rock72/n/ne69fd99b193d>
Google Keepのエクスポート形式のHTMLをObsidian形式のMarkdownに変換するPythonスクリプト例

使い方

仕様

スクリプト例

余談